Esempio n. 1
0
	/**
	 * Compares two UTF-16 strings code point by code point after mapping
	 * each code point through u_tolower().
	 *
	 * Returns -1 or 1 as soon as the lowercased code points differ.
	 * Otherwise the result is whatever checkStringEnd() reports once it
	 * returns something other than 2 (presumably 2 means "both strings
	 * still have input" -- confirm against checkStringEnd).
	 */
	int ICUUnicodeSupport::_compareNoCase<2>(ConstStringHolder<2> _first, ConstStringHolder<2> _second)
	{
		int32_t firstLen = _first.length();
		int32_t secondLen = _second.length();
		int32_t firstPos = 0;
		int32_t secondPos = 0;

		int verdict = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
		if(verdict != 2)
			return verdict;

		const uint16_t* firstUnits = _first.c_str();
		const uint16_t* secondUnits = _second.c_str();
		do
		{
			// Decode one code point from each side (surrogate pairs included).
			UChar32 lhs, rhs;
			U16_NEXT(firstUnits, firstPos, firstLen, lhs);
			U16_NEXT(secondUnits, secondPos, secondLen, rhs);

			lhs = u_tolower(lhs);
			rhs = u_tolower(rhs);
			if(lhs < rhs)
				return -1;
			if(lhs > rhs)
				return 1;

			verdict = checkStringEnd(firstPos, firstLen, secondPos, secondLen);
		}
		while(verdict == 2);

		return verdict;
	}
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) {
unichar* lower=u_strdup(e->inflected);
u_tolower(lower);
KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords);
free(lower);
if (k_inflected==NULL) return;
Ustring* tmp=new_Ustring(64);
u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]);
KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords);
if (k_lemma==NULL) {
	k_lemma=new_KeyWord(0,tmp->str,NULL);
	k_lemma->lemmatized=LEMMATIZED_KEYWORD;
	get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma);
}
/* Now, we look for all the case compatible tokens, and we add
 * their weights to the new lemmatized element
 */
while (k_inflected!=NULL) {
	if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) {
		/* We have a match */
		k_lemma->weight+=k_inflected->weight;
		k_inflected->lemmatized=1;
	}
	k_inflected=k_inflected->next;
}
free_Ustring(tmp);
}
/**
 * Loads a compound word file, adding each word to the keywords.
 */
void load_compound_words(char* name,VersatileEncodingConfig* vec,
		struct string_hash_ptr* keywords) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return;
Ustring* line=new_Ustring(256);
Ustring* lower=new_Ustring(256);
while (EOF!=readline(line,f)) {
	if (line->str[0]=='{') {
		/* We skip tags */
		continue;
	}
	u_strcpy(lower,line->str);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)keywords->value[index];
	add_keyword(&value,line->str,1);
	keywords->value[index]=value;
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
}
Esempio n. 4
0
File: utf.cpp Progetto: MGKhKhD/meta
/**
 * Lowercases a string by handing transform() a functor that maps each
 * code point through ICU's u_tolower() (simple, locale-independent
 * mapping) and returning transform()'s result.
 */
std::string tolower(const std::string& str)
{
    const auto lower_code_point = [](uint32_t cp)
    {
        return u_tolower(static_cast<UChar32>(cp));
    };
    return transform(str, lower_code_point);
}
/**
 * Loads the initial keyword list from a tok_by_freq.txt file,
 * and turns all those tokens in a list whose primary key is the
 * lower case token:
 * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50)
 */
struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return NULL;
Ustring* line=new_Ustring(128);
Ustring* lower=new_Ustring(128);
struct string_hash_ptr* res=new_string_hash_ptr(1024);
int val,pos;
/* We skip the first line of the file, containing the number
 * of tokens
 */
if (EOF==readline(line,f)) {
	fatal_error("Invalid empty file %s\n",name);
}
while (EOF!=readline(line,f)) {
	if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) {
		fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
	}
	u_strcpy(lower,line->str+pos);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)res->value[index];
	res->value[index]=new_KeyWord(val,line->str+pos,value);
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
return res;
}
Esempio n. 6
0
/*
 * Returns a new Python string in which every lowercase code point of
 * `input` is replaced by its uppercase mapping and every uppercase code
 * point by its lowercase mapping (ICU simple mappings).
 *
 * The text is converted UTF-16 -> UTF-32 so that each code point can be
 * handled individually, then converted back. Returns NULL with a Python
 * exception set on allocation failure or when ICU reports an error;
 * NULL is also returned when python_to_icu() fails.
 */
static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
    PyObject *swapped = NULL;
    UErrorCode err = U_ZERO_ERROR;
    UChar *utf16_in = NULL, *utf16_out = NULL;
    UChar32 *code_points = NULL;
    int32_t n16 = 0, n32 = 0, k = 0;

    utf16_in = python_to_icu(input, &n16);
    if (utf16_in == NULL) goto end;

    utf16_out = (UChar*) calloc(3 * n16, sizeof(UChar));
    code_points = (UChar32*) calloc(2 * n16, sizeof(UChar32));
    if (utf16_out == NULL || code_points == NULL) {
        PyErr_NoMemory();
        goto end;
    }

    u_strToUTF32(code_points, 2 * n16, &n32, utf16_in, n16, &err);

    for (k = 0; k < n32; k++) {
        if (u_islower(code_points[k])) {
            code_points[k] = u_toupper(code_points[k]);
        } else if (u_isupper(code_points[k])) {
            code_points[k] = u_tolower(code_points[k]);
        }
    }

    u_strFromUTF32(utf16_out, 3 * n16, &n16, code_points, n32, &err);
    if (U_FAILURE(err)) {
        PyErr_SetString(PyExc_ValueError, u_errorName(err));
        goto end;
    }
    swapped = icu_to_python(utf16_out, n16);

end:
    /* free(NULL) is a no-op, so no guards are needed here */
    free(utf16_in);
    free(utf16_out);
    free(code_points);
    return swapped;

} // }}}
Esempio n. 7
0
/**
 * Tells whether 'c' is a lowercase letter.
 *
 * With an alphabet, the answer is the value of IS_LOWER_MACRO(c,alphabet).
 * Without one (alphabet==NULL), 'c' is lowercase when u_is_letter accepts
 * it and it is equal to its own u_tolower mapping; in that case 2 is
 * returned, otherwise 0.
 */
int is_lower(unichar c,const Alphabet* alphabet) {
if (alphabet!=NULL) {
    return IS_LOWER_MACRO(c,alphabet);
}
if (u_is_letter(c)==0 || c!=u_tolower(c)) {
    return 0;
}
return 2;
}
Esempio n. 8
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
/**
 * Returns the case-insensitive counterpart of this range token.
 *
 * The result is cached in fCaseIToken. It is built on the first call that
 * supplies a non-null token factory: for every character of every range,
 * its other-case forms are added to a new range token, which is then
 * merged with this token's own ranges, compacted and mapped.
 *
 * With an ICU-capable transcoder the upper, lower and title case mappings
 * are used; otherwise only the ASCII letters A-Z / a-z are case-swapped.
 *
 * @param tokFactory factory used to create the new range token; if null
 *                   and nothing is cached yet, the (null) cache is returned.
 * @return the cached case-insensitive token, possibly null.
 */
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory) {

        const bool isNRange = (getTokenType() == T_NRANGE);
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

        // Ranges are stored as (start, end) pairs. "i + 1 < fElemCount" is
        // equivalent to the former "i < fElemCount - 1" for a non-empty
        // token, but cannot wrap around if fElemCount is an unsigned zero.
        for (unsigned int i = 0;  i + 1 < fElemCount;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                // The swapped character is computed into its own variable:
                // modifying 'ch' itself (as was done before) corrupts the
                // iteration, so that e.g. [A-Z] only ever produced 'a'.
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    const XMLInt32  lowerCh = ch + (chLatin_a - chLatin_A);

                    lwrToken->addRange(lowerCh, lowerCh);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    const XMLInt32  upperCh = ch - (chLatin_a - chLatin_A);

                    lwrToken->addRange(upperCh, upperCh);
                }
#endif
            }
        }

        lwrToken->mergeRanges(this);
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
    }

    return fCaseIToken;
}
Esempio n. 9
0
// Returns the lowercase form of this character.
// With APPLE_CHANGES the mapping is delegated to ICU's u_tolower();
// otherwise only code units below 256 are handled, through the C library.
UChar UChar::toLower() const
{
#if APPLE_CHANGES
  const unsigned short lowered = static_cast<unsigned short>(u_tolower(uc));
  return lowered;
#else
  // ### properly support unicode tolower
  const bool singleByte = uc < 256;
  if (singleByte && !islower(uc))
    return (unsigned char)tolower(uc);

  // Either outside the single-byte range or already lowercase.
  return *this;
#endif
}
Esempio n. 10
0
/**
 * Disables the given keyword: the list stored under the lowercased
 * keyword is searched for an entry whose sequence is exactly 'keyword';
 * the first such entry gets its sequence freed and set to NULL.
 * Nothing happens if there is no such entry.
 */
void remove_keyword(unichar* keyword,struct string_hash_ptr* keywords) {
unichar* folded=u_strdup(keyword);
u_tolower(folded);
KeyWord* node=(KeyWord*)get_value(folded,keywords);
free(folded);
for (;node!=NULL;node=node->next) {
	if (node->sequence==NULL || u_strcmp(keyword,node->sequence)!=0) {
		/* Already removed, or a different case variant */
		continue;
	}
	free(node->sequence);
	node->sequence=NULL;
	return;
}
}
Esempio n. 11
0
/**
 * Tells whether the first ACE_PREFIX_LENGTH code units of src, once
 * lowercased with u_tolower(), are equal to ACE_PREFIX.
 * Returns FALSE when src is shorter than the prefix.
 */
inline static UBool 
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }

    for(int8_t idx=0; idx< ACE_PREFIX_LENGTH; idx++){
        if(u_tolower(src[idx]) != ACE_PREFIX[idx]){
            // one mismatch is enough to decide
            return FALSE;
        }
    }
    return TRUE;
}
Esempio n. 12
0
	/**
	 * Lowercases a UTF-16 string in place: each code point is decoded,
	 * mapped through u_tolower() and written back into the same buffer.
	 * The write position is tracked separately from the read position.
	 * Empty strings are left untouched.
	 */
	void ICUUnicodeSupport::_toLowerCase<2>(StringHolder<2> _str)
	{
		if(_str.empty())
			return;

		uint16_t* units = &_str[0];
		int32_t unitCount = _str.length();
		int32_t readPos = 0;
		int32_t writePos = 0;
		while(readPos != unitCount)
		{
			UChar32 cp;
			U16_NEXT(units, readPos, unitCount, cp);
			cp = u_tolower(cp);
			// No bounds check here: the output is written over the input.
			U16_APPEND_UNSAFE(units, writePos, cp);
		}
	}
Esempio n. 13
0
/**
 * Parse a pattern string starting at offset pos.  Keywords are
 * matched case-insensitively.  Spaces may be skipped and may be
 * optional or required.  Integer values may be parsed, and if
 * they are, they will be returned in the given array.  If
 * successful, the offset of the next non-space character is
 * returned.  On failure, -1 is returned.
 * @param pattern must only contain lowercase characters, which
 * will match their uppercase equivalents as well.  A space
 * character matches one or more required spaces.  A '~' character
 * matches zero or more optional spaces.  A '#' character matches
 * an integer and stores it in parsedInts, which the caller must
 * ensure has enough capacity.
 * @param parsedInts array to receive parsed integers.  Caller
 * must ensure that parsedInts.length is >= the number of '#'
 * signs in 'pattern'.
 * @return the position after the last character parsed, or -1 if
 * the parse failed
 */
int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
                              const UnicodeString& pattern, int32_t* parsedInts) {
    // TODO Update this to handle surrogates
    int32_t p;
    int32_t intCount = 0; // number of integers parsed
    for (int32_t i=0; i<pattern.length(); ++i) {
        UChar cpat = pattern.charAt(i);
        UChar c;
        switch (cpat) {
        case 32 /*' '*/:
            if (pos >= limit) {
                return -1;
            }
            c = rule.charAt(pos++);
            if (!PatternProps::isWhiteSpace(c)) {
                return -1;
            }
            // FALL THROUGH to skipWhitespace
            U_FALLTHROUGH;
        case 126 /*'~'*/:
            pos = skipWhitespace(rule, pos);
            break;
        case 35 /*'#'*/:
            p = pos;
            parsedInts[intCount++] = parseInteger(rule, p, limit);
            if (p == pos) {
                // Syntax error; failed to parse integer
                return -1;
            }
            pos = p;
            break;
        default:
            if (pos >= limit) {
                return -1;
            }
            c = (UChar) u_tolower(rule.charAt(pos++));
            if (c != cpat) {
                return -1;
            }
            break;
        }
    }
    return pos;
}
Esempio n. 14
0
/*
Function:
ChangeCase

Performs upper or lower casing of a string into a new buffer.
No special casing is performed beyond that provided by ICU.
*/
extern "C" void ChangeCase(const UChar* lpSrc,
                           int32_t cwSrcLength,
                           UChar* lpDst,
                           int32_t cwDstLength,
                           int32_t bToUpper)
{
	// Iterate through the string, decoding the next one or two UTF-16 code units
	// into a codepoint and updating srcIdx to point to the next UTF-16 code unit 
	// to decode.  Then upper or lower case it, write dstCodepoint into lpDst at 
	// offset dstIdx, and update dstIdx.

	// (The loop here has been manually cloned for each of the four cases, rather
	// than having a single loop that internally branched based on bToUpper as the 
	// compiler wasn't doing that optimization, and it results in an ~15-20% perf
	// improvement on longer strings.)

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = u_toupper(srcCodepoint);
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = u_tolower(srcCodepoint);
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Esempio n. 15
0
/*
Function:
ChangeCaseInvariant

Performs upper or lower casing of a string into a new buffer.
Special casing is performed to ensure that invariant casing 
matches that of Windows in certain situations, e.g. Turkish i's.
*/
extern "C" void ChangeCaseInvariant(const UChar* lpSrc,
                                    int32_t cwSrcLength,
                                    UChar* lpDst,
                                    int32_t cwDstLength,
                                    int32_t bToUpper)
{
	// See algorithmic comment in ChangeCase.

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
			// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
			// We special case it to match the Windows invariant behavior.
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			// On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130)
			// lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069).
			// We special case it to match the Windows invariant behavior.
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Esempio n. 16
0
/*
Function:
ChangeCaseTurkish

Performs upper or lower casing of a string into a new buffer, performing special
casing for Turkish.
*/
extern "C" void ChangeCaseTurkish(const UChar* lpSrc,
								  int32_t cwSrcLength,
								  UChar* lpDst,
								  int32_t cwDstLength,
								  int32_t bToUpper)
{
	// See algorithmic comment in ChangeCase.

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			// In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN
			// CAPITAL LETTER I WITH DOT ABOVE (U+0130).
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			// In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to
			// LATIN SMALL LETTER DOTLESS I (U+0131).
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Esempio n. 17
0
      // Computes the unknown-word signature of a symbol ("<unk...>").
      //
      // The word is scanned code point by code point to collect a few
      // surface features (capitalization, digits, dashes, punctuation,
      // symbols), which are appended to "<unk" as "-caps"/"-lc", "-num",
      // "-dash", "-punct" and "-sym". A suffix marker ("-s", "-ed", ...)
      // is added last, based on the lowercased word.
      // The scheme is the English one of Stanford parser's getSignature5.
      symbol_type operator()(const symbol_type& symbol) const
      {
        const std::string& word = static_cast<const std::string&>(symbol);
        icu::UnicodeString lowered = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));

        int num_caps = 0;
        bool has_digit  = false;
        bool has_dash   = false;
        bool has_lower  = false;
        bool has_punct  = false;
        bool has_symbol = false;

        size_t length = 0;        // number of code points
        UChar32 first = 0;        // first non-zero code point seen
        UChar32 last = 0;         // last code point
        UChar32 before_last = 0;  // code point preceding the last one

        icu::StringCharacterIterator iter(lowered);
        iter.setToStart();
        while (iter.hasNext()) {
          const UChar32 ch = iter.next32PostInc();
          ++ length;

          if (first == 0)
            first = ch;

          before_last = last;
          last = ch;

          const int32_t gc = u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);

          if ((gc & U_GC_PD_MASK) != 0)
            has_dash = true;
          if ((gc & U_GC_P_MASK) != 0)
            has_punct = true;
          if ((gc & U_GC_S_MASK) != 0)
            has_symbol = true;

          if (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE)
            has_digit = true;

          if (u_isUAlphabetic(ch)) {
            // lowercase letters only set has_lower, titlecase letters
            // count both as lower and as capital, anything else
            // alphabetic counts as capital
            const bool is_lowercase = u_isULowercase(ch);
            const bool is_titlecase = ! is_lowercase && u_istitle(ch);

            if (is_lowercase || is_titlecase)
              has_lower = true;
            if (! is_lowercase)
              ++ num_caps;
          }
        }

        // suffix tests below work on the lowercased word...
        lowered.toLower();
        if (before_last)
          before_last = u_tolower(before_last);
        if (last)
          last = u_tolower(last);

        std::string signature = "<unk";

        // we do not check loc...
        const bool initial_cap = u_isUUppercase(first) || u_istitle(first);
        if (initial_cap || (! u_isUAlphabetic(first) && num_caps))
          signature += "-caps";
        else if (has_lower)
          signature += "-lc";

        if (has_digit)
          signature += "-num";
        if (has_dash)
          signature += "-dash";
        if (has_punct)
          signature += "-punct";
        if (has_symbol)
          signature += "-sym";

        if (length >= 3 && last == 's') {
          const bool excluded = (before_last == 's' || before_last == 'i' || before_last == 'u');
          if (! excluded)
            signature += "-s";
        } else if (length >= 5 && ! has_dash && ! (has_digit && num_caps > 0)) {
          // tried in this order, first match wins
          static const char* const suffixes[] = {
            "ed", "ing", "ion", "er", "est", "ly", "ity", "y", "al"
          };
          const size_t num_suffixes = sizeof(suffixes) / sizeof(suffixes[0]);

          for (size_t i = 0; i != num_suffixes; ++ i)
            if (lowered.endsWith(suffixes[i])) {
              signature += '-';
              signature += suffixes[i];
              break;
            }
        }

        signature += '>';

        return signature;
      }
Esempio n. 18
0
/**
 * Explores the given dictionary to match the given word.
 *
 * This is a recursive, backtracking exploration: starting from the
 * dictionary state located at 'offset', it tries to consume word[pos_word]
 * along every outgoing transition, either exactly or by spending one of
 * the errors allowed by 'cfg' (letter swap, letter change, letter
 * suppression, letter insertion).
 *
 * - 'inflected' accumulates the characters of the dictionary path followed
 *   so far; it is truncated back to its entry length after each attempt.
 * - 'output' accumulates the transition outputs; it is saved on entry and
 *   restored before returning at the end of the function.
 * - 'base' is set to the output length each time a final state is met.
 * - cfg->pairs stores, for each error currently in use, the position in the
 *   word followed by the kind of error; the cfg->current_XXX counters are
 *   incremented before each recursive call and decremented afterwards.
 * - 'list' is handed to deal_with_matches when a final state is reached at
 *   the end of the word (presumably to collect the hypotheses -- see
 *   deal_with_matches).
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
		Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
int z=save_output(output);
/* Number of elements in cfg->pairs on entry, used to undo our additions */
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
	if (word[pos_word]=='\0') {
		/* If we have a match */
		deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
	}
	base=output->len;
}
/* If we are at the end of the token, then we stop */
if (word[pos_word]=='\0') {
	return;
}
/* Length of 'inflected' on entry, used to undo our additions */
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
	restore_output(z,output);
	offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
	/* For backup_output, see comment below */
	int backup_output=save_output(output);
	/* Exact match, or the word has the uppercase version of the
	 * dictionary letter: no error is consumed */
	if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
	} else {
		/* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
		if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
				&& is_letter_swap(cfg,word,pos_word,inflected,c)) {
			/* We don't modify the number of errors since we override an existing
			 * SP_CHANGE_XXX one */
			cfg->current_SP_SWAP++;
			/* We override the previous change */
			int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
			int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
			cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
			u_strcat(inflected,c);
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* We put back the pair that we had overridden */
			cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
			cfg->current_SP_SWAP--;
		} else /* We deal with the SP_CHANGE case */
		       if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
				/* We want letters, not spaces or anything else */
				&& is_letter(c,NULL)
		        /* We do not allow the replacement of a lowercase letter by an uppercase
		         * letter at the beginning of the word like Niserable, unless the whole word
		         * is in uppercase or the letter is the same, modulo the case */
		        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
			cfg->current_errors++;
			cfg->current_SP_CHANGE++;
			/* Now we test all possible kinds of change */
			vector_int_add(cfg->pairs,pos_word);
			u_strcat(inflected,c);
			/* We always add the default case */
			vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
			/* Size of cfg->pairs including our own pair: each variant below
			 * resets to it and rewrites the last element (the kind) */
			int n_elem=cfg->pairs->nbelems;
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* Then we test the accent case */
			if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
				/* After a call to explore_dic, we must restore the output.
				 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
				 * output including the output associated to the current transition,
				 * which is why we don't use z (output before the current transition)
				 * but backup_output */
				restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
			    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
			    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And the case variations */
			if (u_tolower(c)==u_tolower(word[pos_word])) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And finally the position on keyboard */
			if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			cfg->pairs->nbelems=size_pairs;
			cfg->current_errors--;
			cfg->current_SP_CHANGE--;
			/* End of the SP_CHANGE case */
		}
	}
    restore_output(backup_output,output);
	truncate(inflected,l2);
	/* Now we deal with the SP_SUPPR case */
	if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
		/* We want letters, not spaces or anything else */
		&& is_letter(c,NULL)) {
		cfg->current_errors++;
		cfg->current_SP_SUPPR++;
		vector_int_add(cfg->pairs,pos_word);
		if (pos_word>=1 && c==word[pos_word-1]) {
			vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
		} else {
			vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
		}
		u_strcat(inflected,c);
		/* We follow the transition without consuming any letter of the word,
		 * which is why pos_word is not increased here */
		explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
		truncate(inflected,l2);
		cfg->pairs->nbelems=size_pairs;
		cfg->current_errors--;
		cfg->current_SP_SUPPR--;
	}
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
	/* We want letters, not spaces or anything else */
	&& is_letter(word[pos_word],NULL)
	/* We do not allow the insertion of a capital letter at the beginning of
	 * the word like Astreet, unless the whole word is in uppercase like ASTREET */
    && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
	cfg->current_errors++;
	cfg->current_SP_INSERT++;
	vector_int_add(cfg->pairs,pos_word);
	if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
		vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
	} else {
		vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
	}
	explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
	truncate(inflected,l2);
	cfg->pairs->nbelems=size_pairs;
	cfg->current_errors--;
	cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
// Native helper: returns ICU's simple lowercase mapping (u_tolower) of
// the given code point. The JNIEnv and jclass arguments are unused.
static jint Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jint lowered = u_tolower(codePoint);
    return lowered;
}
Esempio n. 20
0
//static jint Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint) {
// Exported JNI entry point: returns ICU's simple lowercase mapping
// (u_tolower) of the given code point. The JNIEnv and jclass arguments
// are unused.
JNIEXPORT jint JNICALL
Java_java_lang_Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jint lowered = u_tolower(codePoint);
    return lowered;
}
Esempio n. 21
0
/**
 * Sample 06: frequency distribution of the characters of a UTF-8 document.
 *
 * Reads "data06.txt" block by block, decodes it one code point at a time
 * with a UTF-8 converter and counts how often each BMP code point occurs.
 * It also counts letters and the "ie" and "g + U+0127" digraphs (compared
 * after lowercasing), then prints the non-zero frequencies.
 *
 * @return U_ZERO_ERROR on success, U_FILE_ACCESS_ERROR if the data file
 *         cannot be opened, U_MEMORY_ALLOCATION_ERROR if the frequency
 *         table cannot be allocated, U_UNSUPPORTED_ERROR if the document
 *         contains a code point outside of the frequency table.
 */
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;   /* previous code point, used to detect digraphs */

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    /* the size is cast because %d expects an int, not a size_t */
    fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", (int)(sizeof(CharFreqInfo)*charCount));
    /* bail out: the table is dereferenced right below */
    fclose(f);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      U_ASSERT(status);
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      // Valid indexes are 0..charCount-1, so charCount itself must be
      // rejected too (">" alone let info[charCount] be written).
      if(p>=charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        // release everything acquired so far before giving up
        fclose(f);
        ucnv_close(conv);
        free(uBuf);
        free(info);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  free(uBuf);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
Esempio n. 22
0
static void demoCaseMapInC() {
    /*
     * input=
     *   "aB<capital sigma>"
     *   "iI<small dotless i><capital dotted I> "
     *   "<sharp s> <small lig. ffi>"
     *   "<small final sigma><small sigma><capital sigma>"
     */
    static const UChar input[]={
        0x61, 0x42, 0x3a3,
        0x69, 0x49, 0x131, 0x130, 0x20,
        0xdf, 0x20, 0xfb03,
        0x3c2, 0x3c3, 0x3a3, 0
    };
    UChar buffer[32];

    UErrorCode errorCode;
    UChar32 c;
    int32_t i, j, length;
    UBool isError;

    printf("\n* demoCaseMapInC() ----------------- ***\n\n");

    /*
     * First, use simple case mapping functions which provide
     * 1:1 code point mappings without context/locale ID.
     *
     * Note that some mappings will not be "right" because some "real"
     * case mappings require context, depend on the locale ID,
     * and/or result in a change in the number of code points.
     */
    printUString("input string: ", input, -1);

    /* uppercase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_toupper(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-uppercased: ", buffer, j);
    /* lowercase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_tolower(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-lowercased: ", buffer, j);
    /* titlecase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_totitle(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-titlecased: ", buffer, j);
    /* case-fold/default */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_foldCase(c, U_FOLD_CASE_DEFAULT);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-case-folded/default: ", buffer, j);
    /* case-fold/Turkic */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-case-folded/Turkic: ", buffer, j);

    /*
     * Second, use full case mapping functions which provide
     * 1:n code point mappings (n can be 0!) and are sensitive to context and locale ID.
     *
     * Note that lower/upper/titlecasing take a locale ID while case-folding
     * has bit flag options instead, by design of the Unicode SpecialCasing.txt UCD file.
     *
     * Also, string titlecasing requires a BreakIterator to find starts of words.
     * The sample code here passes in a NULL pointer; u_strToTitle() will open and close a default
     * titlecasing BreakIterator automatically.
     * For production code where many strings are titlecased it would be more efficient
     * to open a BreakIterator externally and pass it in.
     */
    printUString("\ninput string: ", input, -1);

    /* lowercase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-lowercased/en: ", buffer, length);
    } else {
        printf("error in u_strToLower(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* lowercase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-lowercased/tr: ", buffer, length);
    } else {
        printf("error in u_strToLower(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* uppercase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-uppercased/en: ", buffer, length);
    } else {
        printf("error in u_strToUpper(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* uppercase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-uppercased/tr: ", buffer, length);
    } else {
        printf("error in u_strToUpper(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* titlecase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-titlecased/en: ", buffer, length);
    } else {
        printf("error in u_strToTitle(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* titlecase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-titlecased/tr: ", buffer, length);
    } else {
        printf("error in u_strToTitle(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* case-fold/default */
    errorCode=U_ZERO_ERROR;
    length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_DEFAULT, &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-case-folded/default: ", buffer, length);
    } else {
        printf("error in u_strFoldCase(default)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* case-fold/Turkic */
    errorCode=U_ZERO_ERROR;
    length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-case-folded/Turkic: ", buffer, length);
    } else {
        printf("error in u_strFoldCase(Turkic)=%ld error=%s\n", length, u_errorName(errorCode));
    }
}
Esempio n. 23
0
// Returns ICU's simple lowercase mapping (u_tolower) of the given code point.
jint fastiva_vm_Character_C$__toLowerCaseImpl(jint codePoint) {
    const jint lowered = u_tolower(codePoint);
    return lowered;
}
Esempio n. 24
0
//
// Recursively explores the binary dictionary automaton stored in tableau_bin
// in order to split original_word into a sequence of dictionary components.
// Every complete analysis whose last component passes
// check_valid_right_component_for_one_INF_code_german is pushed at the head
// of *L.
//
// adresse:                  offset in tableau_bin of the state to explore
// current_component:        buffer receiving the letters of the component
//                           currently being matched
// pos_in_current_component: index of the next free cell in current_component
// original_word:            the word being decomposed
// pos_in_original_word:     index of the next letter of original_word to match
// decomposition:            analysis built so far; entries are joined with
//                           " +++ "
// dela_line:                surface form of the components matched so far;
//                           its first character is upper-cased in place when
//                           it is not empty
// L:                        in/out head of the list of decompositions found
// n_decomp:                 value stored in the n_parts field of each result
// left, right:              arrays indexed by INF code index; a non-zero cell
//                           means the code is accepted as a left/right
//                           component
// inf_codes:                INF code lists used to rebuild dictionary entries
// alphabet:                 passed to is_equal_or_uppercase for letter matching
// tableau_bin:              raw bytes of the dictionary automaton
//
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
                   const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
                   unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
                   const char* left,const char* right,
                   const struct INF_codes* inf_codes,const Alphabet* alphabet,
                   const unsigned char* tableau_bin) {
int c;
int index,t;
// The state starts with a 2-byte big-endian header: when the high bit (32768)
// is clear the state is terminal, and the remaining bits give the number of
// outgoing transitions.
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
  // if we are in a terminal state
  // a terminal state carries a 3-byte big-endian INF code index after its header
  index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
  current_component[pos_in_current_component]='\0';
  if (pos_in_current_component>1) {
    // we don't consider words with a length of 1
    if (original_word[pos_in_original_word]=='\0') {
      // if we have explored the entire original word
      if (right[index]) {
         // and if we have a valid right component
         // one candidate result is built for every INF code attached to this state
         struct list_ustring* l=inf_codes->codes[index];
         while (l!=NULL) {
            // NOTE(review): dec holds 500 unichars here, while the recursive
            // call below builds `decomposition` in a 2000-unichar buffer;
            // confirm that long decompositions cannot overflow this copy.
            unichar dec[500];
            u_strcpy(dec,decomposition);
            if (dec[0]!='\0') {u_strcat(dec," +++ ");}
            unichar entry[500];
            uncompress_entry(current_component,l->string,entry);
            u_strcat(dec,entry);
            unichar new_dela_line[500];
            struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
            if (tmp_entry==NULL) {
               /* If there was an error in the dictionary, we skip the entry */
               l=l->next;
               continue;
            }
            // change case if there is a prefix
            // prefixes are downcase, nouns (=suffixes) uppercase:
            // "investitionsObjekte" -> "Investitionsobjekte"
            if ( u_strlen(dela_line) != 0 ) {
              // capitalize dela_line
              dela_line[0] = u_toupper((unichar) dela_line[0]);
              // downcase lemma and inflected
              tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
              tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
            }
            // Assemble the output line:
            // <prefix><inflected>,<prefix><lemma>.<code>{+<code>}{:<inflectional code>}
            u_strcpy(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->inflected);
            u_strcat(new_dela_line,",");
            u_strcat(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->lemma);
            u_strcat(new_dela_line,".");
            u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
            int k;
            for (k=1;k<tmp_entry->n_semantic_codes;k++) {
               u_strcat(new_dela_line,"+");
               u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
            }
            for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
               u_strcat(new_dela_line,":");
               u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
            }
            free_dela_entry(tmp_entry);
            struct german_word_decomposition* wd=new_german_word_decomposition();
            wd->n_parts=n_decomp;
            u_strcpy(wd->decomposition,dec);
            u_strcpy(wd->dela_line,new_dela_line);
            if (check_valid_right_component_for_one_INF_code_german(l->string)) {
               // if we got a correct right component (N-FF)
               // the new result is linked in front of the current list
               struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
               wdl->element=wd;
               wdl->suivant=(*L);
               (*L)=wdl;
            } else {
               // rejected candidate: release it so that it does not leak
               free_german_word_decomposition(wd);
            }
            l=l->next;
         }
      }
    }
    else {
      // else, we must explore the rest of the original word
      if (left[index]) {
         // but only if the current component was a valid left one
         // we go on with the next component
         unichar dec[2000];
         unichar line[500];
         u_strcpy(dec,decomposition);
         if (dec[0]!='\0') {u_strcat(dec," +++ ");}
         unichar sia_code[500];
         unichar entry[500];
         get_first_sia_code_german(index,sia_code,inf_codes);
         uncompress_entry(current_component,sia_code,entry);
         u_strcat(dec,entry);
         u_strcpy(line,dela_line);
         u_strcat(line,current_component);
         // The next component is matched into a fresh buffer, restarting at
         // offset 4 of tableau_bin (presumably the automaton's initial state
         // -- confirm) and with one more part counted.
         unichar temp[500];
         explore_state_german(4,temp,0,original_word,pos_in_original_word,
                  dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
      }
    }
  }
  // transitions of a terminal state start after the 2-byte header and the 3-byte index
  t=adresse+5;
}
else {
  // non-terminal state: clear the flag bit to get the transition count;
  // the transitions start right after the 2-byte header
  c=c-32768;
  t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
   // if we have finished, we return
   return;
}
// if not, we go on with the next letter
// each transition takes 5 bytes: a 2-byte big-endian letter followed by the
// 3-byte big-endian offset of the destination state
for (int i=0;i<c;i++) {
  // the letter test is applied in both directions between the transition
  // letter and the current letter of the word
  if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
      || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
    index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
    // the dictionary's letter (not the word's) is the one recorded in the component
    current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
    explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
                  decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
  }
  t=t+5;
}
}
Esempio n. 25
0
// Returns the result of u_tolower for code point c.
uint32
BUnicodeChar::ToLower(uint32 c)
{
	// A temporary BUnicodeChar is constructed and dropped before the lookup,
	// presumably for the constructor's side effects -- confirm.
	BUnicodeChar();
	const uint32 lowered = u_tolower(c);
	return lowered;
}
// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, bool decompose,
                          UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation
    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (char32 u_ch : uni_vector) {
      if (u_isalpha(u_ch)) unichar_isalpha = true;
      if (u_islower(u_ch)) unichar_islower = true;
      if (u_isupper(u_ch)) unichar_isupper = true;
      if (u_isdigit(u_ch)) unichar_isdigit = true;
      if (u_ispunct(u_ch)) unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      std::vector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    std::vector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    std::string normed_str;
    if (unichar_id != 0 &&
        tesseract::NormalizeUTF8String(
            decompose ? tesseract::UnicodeNormMode::kNFKD
                      : tesseract::UnicodeNormMode::kNFKC,
            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
            unichar_str, &normed_str) &&
        !normed_str.empty()) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}
Esempio n. 27
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
/**
 * Returns the case-insensitive counterpart of this range token.
 *
 * The counterpart is built on the first call for which a token factory is
 * supplied and this token has a range array; it is then cached in
 * fCaseIToken and returned by every later call.
 *
 * Three build strategies are selected at compile time:
 *  - ICU 2.4 or later: the ranges are written as a UnicodeSet pattern and
 *    ICU builds the set with the USET_CASE_INSENSITIVE option;
 *  - older ICU: each character's upper/lower/title mappings are added, plus
 *    the entries of the s_exceptions table, and the original ranges are
 *    merged in;
 *  - no ICU: only the ASCII letters are case-swapped, plus the entries of
 *    the s_exceptions table, and the original ranges are merged in.
 *
 * @param tokFactory factory used to create the new range token; when it is
 *                   null nothing is built
 * @return the cached case-insensitive token, or 0 if none has been built
 */
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory && fRanges) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

#if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4))
        // Each range pair needs at most 21 code units ("\UXXXXXXXX-\UXXXXXXXX").
        // The extra 3 units hold '[', ']' and the terminator, so that the
        // buffer is large enough even when fElemCount is 0.
        UChar* rangeStr=(UChar*)fMemoryManager->allocate((40*fElemCount+3)*sizeof(UChar));
        ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager);
        int c=0;
        rangeStr[c++] = chOpenSquare;
        // "i + 1 < fElemCount" rather than "i < fElemCount - 1": the latter
        // wraps around when fElemCount is 0 because the arithmetic is unsigned.
        for (unsigned int i = 0;  i + 1 < fElemCount;  i += 2) {
            XMLCh buffer[10];
            XMLSize_t len, j;

            // Lower bound, written as \U followed by 8 zero-padded hex digits.
            rangeStr[c++] = chBackSlash;
            rangeStr[c++] = chLatin_U;
            XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager);
            len = XMLString::stringLen(buffer);
            for(j=0;j<(8-len);j++)
                rangeStr[c++] = chDigit_0;
            XMLCh* p=buffer;
            while(*p)
                rangeStr[c++] = *p++;
            // Upper bound, only when the range holds more than one character.
            if(fRanges[i+1]!=fRanges[i])
            {
                rangeStr[c++] = chDash;
                rangeStr[c++] = chBackSlash;
                rangeStr[c++] = chLatin_U;
                XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager);
                len = XMLString::stringLen(buffer);
                for(j=0;j<(8-len);j++)
                    rangeStr[c++] = chDigit_0;
                p=buffer;
                while(*p)
                    rangeStr[c++] = *p++;
            }
        }
        rangeStr[c++] = chCloseSquare;
        rangeStr[c++] = chNull;
        UErrorCode ec=U_ZERO_ERROR;
        USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec);
        if(range)
        {
            // First call with no buffer to learn the serialized length, then
            // serialize for real; ec is reset because the sizing call reports
            // an error status.
            ec = U_ZERO_ERROR;
            uint32_t cbCount=uset_serialize(range, NULL, 0, &ec);
            uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t));
            ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager);
            ec = U_ZERO_ERROR;
            uset_serialize(range, buffer, cbCount, &ec);
            USerializedSet serializedSet;
            uset_getSerializedSet(&serializedSet, buffer, cbCount);
            // Copy every range of the ICU set into the new token.
            int32_t nSets=uset_getSerializedRangeCount(&serializedSet);
            for(int32_t i=0; i<nSets; i++)
            {
                UChar32 start, end;
                uset_getSerializedRange(&serializedSet, i, &start, &end);
                lwrToken->addRange(start, end);
            }
            // This call only resets serializedSet to a one-character set;
            // the serialized buffer itself is released by janSet.
            uset_setSerializedToOne(&serializedSet, 32);
            uset_close(range);
        }
#else
        // Position in s_exceptions; it only moves forward (this presumably
        // relies on the table and the ranges being sorted in ascending
        // order -- confirm).
        unsigned int exceptIndex = 0;

        // "i + 1 < fElemCount" rather than "i < fElemCount - 1": the latter
        // wraps around when fElemCount is 0 because the arithmetic is unsigned.
        for (unsigned int i = 0;  i + 1 < fElemCount;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if XERCES_USE_TRANSCODER_ICU
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                // The swapped character goes into its own variable: ch drives
                // the enclosing loop and the exception lookup below, so it
                // must not be modified here.
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    const XMLInt32  lowerCh = ch + (chLatin_a - chLatin_A);

                    lwrToken->addRange(lowerCh, lowerCh);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    const XMLInt32  upperCh = ch - (chLatin_a - chLatin_A);

                    lwrToken->addRange(upperCh, upperCh);
                }
#endif

                const unsigned int  exceptionsSize =
                    sizeof(s_exceptions) / sizeof(s_exceptions[0]);

                // Add any exception chars.  These are characters where the
                // case mapping is not symmetric.  (Unicode case mappings are not isomorphic...)
                while (exceptIndex < exceptionsSize)
                {
                    if (s_exceptions[exceptIndex].baseChar < ch)
                    {
                        ++exceptIndex;
                    }
                    else if (s_exceptions[exceptIndex].baseChar == ch)
                    {
                        const XMLInt32  matchingChar =
                            s_exceptions[exceptIndex].matchingChar;

                        lwrToken->addRange(
                            matchingChar,
                            matchingChar);

                        ++exceptIndex;
                    }
                    else
                    {
                        break;
                    }
                }
            }
        }

        // The token built above only holds the case variants; the original
        // ranges are merged in here.
        lwrToken->mergeRanges(this);
#endif
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
        // TODO(dbertoni) This is a temporary hack until we can change the ABI.
        // See Jira issue XERCESC-1866 for more details.
        // Overload the fCaseIToken data member to be the case-insensitive token
        // that's caching the case-insensitive one.  We need this because tokens
        // have varying lifetimes.
        fCaseIToken->setCaseInsensitiveToken(this);
    }

    return fCaseIToken;
}