/**
 * Case-insensitive comparison of two UTF-16 strings.
 *
 * Both strings are walked one code point at a time (surrogate pairs are
 * decoded by U16_NEXT); each code point is lowercased with u_tolower() and
 * the first difference decides the result.
 *
 * @param _first  left-hand string
 * @param _second right-hand string
 * @return -1 or 1 at the first differing lowercased code point; otherwise
 *         whatever checkStringEnd() reports once it no longer returns 2.
 */
int ICUUnicodeSupport::_compareNoCase<2>(ConstStringHolder<2> _first, ConstStringHolder<2> _second)
{
    int32_t firstLength = _first.length();
    int32_t secondLength = _second.length();
    int32_t firstPos = 0;
    int32_t secondPos = 0;

    // A result of 2 from checkStringEnd() means the comparison must go on.
    int endState = checkStringEnd(firstPos, firstLength, secondPos, secondLength);
    if(endState != 2)
        return endState;

    const uint16_t* firstUnits = _first.c_str();
    const uint16_t* secondUnits = _second.c_str();

    do
    {
        UChar32 firstCp, secondCp;
        U16_NEXT(firstUnits, firstPos, firstLength, firstCp);
        U16_NEXT(secondUnits, secondPos, secondLength, secondCp);

        firstCp = u_tolower(firstCp);
        secondCp = u_tolower(secondCp);
        if(firstCp < secondCp)
            return -1;
        if(firstCp != secondCp)
            return 1;

        endState = checkStringEnd(firstPos, firstLength, secondPos, secondLength);
    }
    while(endState == 2);

    return endState;
}
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) { unichar* lower=u_strdup(e->inflected); u_tolower(lower); KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords); free(lower); if (k_inflected==NULL) return; Ustring* tmp=new_Ustring(64); u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]); KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords); if (k_lemma==NULL) { k_lemma=new_KeyWord(0,tmp->str,NULL); k_lemma->lemmatized=LEMMATIZED_KEYWORD; get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma); } /* Now, we look for all the case compatible tokens, and we add * their weights to the new lemmatized element */ while (k_inflected!=NULL) { if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) { /* We have a match */ k_lemma->weight+=k_inflected->weight; k_inflected->lemmatized=1; } k_inflected=k_inflected->next; } free_Ustring(tmp); }
/** * Loads a compound word file, adding each word to the keywords. */ void load_compound_words(char* name,VersatileEncodingConfig* vec, struct string_hash_ptr* keywords) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return; Ustring* line=new_Ustring(256); Ustring* lower=new_Ustring(256); while (EOF!=readline(line,f)) { if (line->str[0]=='{') { /* We skip tags */ continue; } u_strcpy(lower,line->str); u_tolower(lower->str); int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)keywords->value[index]; add_keyword(&value,line->str,1); keywords->value[index]=value; } free_Ustring(line); free_Ustring(lower); u_fclose(f); }
/// Returns a copy of `str` in which every code point handed out by
/// transform() has been replaced by its u_tolower() mapping.
std::string tolower(const std::string& str)
{
    const auto lowerCodePoint = [](uint32_t cp) {
        return u_tolower(static_cast<UChar32>(cp));
    };
    return transform(str, lowerCodePoint);
}
/** * Loads the initial keyword list from a tok_by_freq.txt file, * and turns all those tokens in a list whose primary key is the * lower case token: * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50) */ struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return NULL; Ustring* line=new_Ustring(128); Ustring* lower=new_Ustring(128); struct string_hash_ptr* res=new_string_hash_ptr(1024); int val,pos; /* We skip the first line of the file, containing the number * of tokens */ if (EOF==readline(line,f)) { fatal_error("Invalid empty file %s\n",name); } while (EOF!=readline(line,f)) { if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) { fatal_error("Invalid line in file %s:\n%S\n",name,line->str); } u_strcpy(lower,line->str+pos); u_tolower(lower->str); int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)res->value[index]; res->value[index]=new_KeyWord(val,line->str+pos,value); } free_Ustring(line); free_Ustring(lower); u_fclose(f); return res; }
// Swap the case of every code point of `input`: lowercase code points are
// uppercased and uppercase ones lowercased, using ICU's simple (1:1)
// mappings. Returns a new Python string, or NULL with a Python exception set
// (MemoryError on allocation failure, ValueError carrying the ICU error name
// when a UTF-16 <-> UTF-32 conversion fails).
static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
    PyObject *result = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UChar *input_buf = NULL, *output_buf = NULL;
    UChar32 *buf = NULL;
    int32_t sz = 0, sz32 = 0, i = 0;

    input_buf = python_to_icu(input, &sz);
    if (input_buf == NULL) goto end;

    // Allocate one spare element: calloc(0, ...) is allowed to return NULL,
    // which would make an empty input look like an out-of-memory condition.
    // The capacities handed to ICU below are unchanged.
    output_buf = (UChar*) calloc(3 * sz + 1, sizeof(UChar));
    buf = (UChar32*) calloc(2 * sz + 1, sizeof(UChar32));
    if (output_buf == NULL || buf == NULL) { PyErr_NoMemory(); goto end; }

    u_strToUTF32(buf, 2 * sz, &sz32, input_buf, sz, &status);
    // Do not walk the buffer if the conversion itself failed.
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    for (i = 0; i < sz32; i++) {
        if (u_islower(buf[i])) buf[i] = u_toupper(buf[i]);
        else if (u_isupper(buf[i])) buf[i] = u_tolower(buf[i]);
    }

    u_strFromUTF32(output_buf, 3*sz, &sz, buf, sz32, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    result = icu_to_python(output_buf, sz);

end:
    // Single exit point: every buffer is released whatever happened above.
    if (input_buf != NULL) free(input_buf);
    if (output_buf != NULL) free(output_buf);
    if (buf != NULL) free(buf);
    return result;
} // }}}
/**
 * Tests whether 'c' is a lowercase letter.
 *
 * With an alphabet, the answer is the one given by IS_LOWER_MACRO.
 * Without one (alphabet==NULL), 'c' must be a letter according to
 * u_is_letter and must be equal to its own lowercase form.
 *
 * Returns 2 when 'c' is lowercase in the no-alphabet case, 0 otherwise.
 */
int is_lower(unichar c,const Alphabet* alphabet) {
if (alphabet!=NULL) {
	return IS_LOWER_MACRO(c,alphabet);
}
if (u_is_letter(c) == 0) {
	return 0;
}
if (c == u_tolower(c)) {
	return 2;
}
return 0;
}
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER) const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif } } lwrToken->mergeRanges(this); lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; } return fCaseIToken; }
// Returns the lowercase counterpart of this character. With APPLE_CHANGES the
// mapping comes from u_tolower(); otherwise only values below 256 that are not
// already lowercase are mapped, through the C library tolower().
UChar UChar::toLower() const
{
#if APPLE_CHANGES
  const unsigned short lowered = static_cast<unsigned short>(u_tolower(uc));
  return lowered;
#else
  // ### properly support unicode tolower
  if (uc < 256 && !islower(uc))
    return (unsigned char)tolower(uc);

  return *this;
#endif
}
/**
 * Disables the given keyword: the list stored under its lowercased form
 * is searched for an element whose sequence is exactly 'keyword'; the
 * sequence of the first such element is freed and set to NULL. The list
 * element itself is left in place.
 */
void remove_keyword(unichar* keyword,struct string_hash_ptr* keywords) {
unichar* lowered_key=u_strdup(keyword);
u_tolower(lowered_key);
KeyWord* head=(KeyWord*)get_value(lowered_key,keywords);
free(lowered_key);
for (KeyWord* current=head;current!=NULL;current=current->next) {
	if (current->sequence==NULL) {
		continue;
	}
	if (u_strcmp(keyword,current->sequence)==0) {
		free(current->sequence);
		current->sequence=NULL;
		return;
	}
}
}
/**
 * Tells whether the first ACE_PREFIX_LENGTH code units of src, once
 * lowercased with u_tolower(), are equal to ACE_PREFIX.
 *
 * @param src        string to test
 * @param srcLength  number of code units available in src
 * @return TRUE on a match, FALSE if src is too short or differs
 */
inline static UBool
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }

    for(int8_t idx=0; idx< ACE_PREFIX_LENGTH; idx++){
        if(u_tolower(src[idx]) != ACE_PREFIX[idx]){
            return FALSE;
        }
    }
    return TRUE;
}
/**
 * Lowercases a UTF-16 string in place.
 *
 * Code points are read with U16_NEXT, mapped with u_tolower() and written
 * back into the same buffer with U16_APPEND_UNSAFE, using separate read and
 * write positions. An empty string is left untouched.
 */
void ICUUnicodeSupport::_toLowerCase<2>(StringHolder<2> _str)
{
    if(_str.empty())
        return;

    uint16_t* units = &_str[0];
    int32_t unitCount = _str.length();
    int32_t readPos = 0;
    int32_t writePos = 0;

    while(readPos != unitCount)
    {
        UChar32 codePoint;
        U16_NEXT(units, readPos, unitCount, codePoint);
        codePoint = u_tolower(codePoint);
        U16_APPEND_UNSAFE(units, writePos, codePoint);
    }
}
/** * Parse a pattern string starting at offset pos. Keywords are * matched case-insensitively. Spaces may be skipped and may be * optional or required. Integer values may be parsed, and if * they are, they will be returned in the given array. If * successful, the offset of the next non-space character is * returned. On failure, -1 is returned. * @param pattern must only contain lowercase characters, which * will match their uppercase equivalents as well. A space * character matches one or more required spaces. A '~' character * matches zero or more optional spaces. A '#' character matches * an integer and stores it in parsedInts, which the caller must * ensure has enough capacity. * @param parsedInts array to receive parsed integers. Caller * must ensure that parsedInts.length is >= the number of '#' * signs in 'pattern'. * @return the position after the last character parsed, or -1 if * the parse failed */ int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit, const UnicodeString& pattern, int32_t* parsedInts) { // TODO Update this to handle surrogates int32_t p; int32_t intCount = 0; // number of integers parsed for (int32_t i=0; i<pattern.length(); ++i) { UChar cpat = pattern.charAt(i); UChar c; switch (cpat) { case 32 /*' '*/: if (pos >= limit) { return -1; } c = rule.charAt(pos++); if (!PatternProps::isWhiteSpace(c)) { return -1; } // FALL THROUGH to skipWhitespace U_FALLTHROUGH; case 126 /*'~'*/: pos = skipWhitespace(rule, pos); break; case 35 /*'#'*/: p = pos; parsedInts[intCount++] = parseInteger(rule, p, limit); if (p == pos) { // Syntax error; failed to parse integer return -1; } pos = p; break; default: if (pos >= limit) { return -1; } c = (UChar) u_tolower(rule.charAt(pos++)); if (c != cpat) { return -1; } break; } } return pos; }
/* Function: ChangeCase Performs upper or lower casing of a string into a new buffer. No special casing is performed beyond that provided by ICU. */ extern "C" void ChangeCase(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // Iterate through the string, decoding the next one or two UTF-16 code units // into a codepoint and updating srcIdx to point to the next UTF-16 code unit // to decode. Then upper or lower case it, write dstCodepoint into lpDst at // offset dstIdx, and update dstIdx. // (The loop here has been manually cloned for each of the four cases, rather // than having a single loop that internally branched based on bToUpper as the // compiler wasn't doing that optimization, and it results in an ~15-20% perf // improvement on longer strings.) UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = u_toupper(srcCodepoint); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = u_tolower(srcCodepoint); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
/* Function: ChangeCaseInvariant Performs upper or lower casing of a string into a new buffer. Special casing is performed to ensure that invariant casing matches that of Windows in certain situations, e.g. Turkish i's. */ extern "C" void ChangeCaseInvariant(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // See algorithmic comment in ChangeCase. UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). // We special case it to match the Windows invariant behavior. U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { // On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130) // lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069). // We special case it to match the Windows invariant behavior. U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
/* Function: ChangeCaseTurkish Performs upper or lower casing of a string into a new buffer, performing special casing for Turkish. */ extern "C" void ChangeCaseTurkish(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // See algorithmic comment in ChangeCase. UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { // In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN // CAPITAL LETTER I WITH DOT ABOVE (U+0130). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { // In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to // LATIN SMALL LETTER DOTLESS I (U+0131). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
// Builds an "unknown word" signature for the given symbol.
//
// The symbol is read as a UTF-8 string and scanned code point by code point
// to collect a few features (capitals, lowercase, digits, dashes,
// punctuation, symbols, length, first and last two characters). The result
// is the string "<unk" followed by zero or more feature suffixes ("-caps",
// "-lc", "-num", "-dash", "-punct", "-sym", then either "-s" or one of the
// English ending suffixes) and a closing '>'.
symbol_type operator()(const symbol_type& symbol) const
{
  const std::string& word = static_cast<const std::string&>(symbol);

  icu::UnicodeString uword = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));

  std::string signature = "<unk";

  // signature for English, taken from Stanford parser's getSignature5

  int num_caps = 0;
  bool has_digit = false;
  bool has_dash = false;
  bool has_lower = false;
  bool has_punct = false;
  bool has_symbol = false;

  size_t length = 0;     // number of code points in the word
  UChar32 ch0 = 0;       // first code point
  UChar32 ch_1 = 0;      // last code point seen
  UChar32 ch_2 = 0;      // code point before the last one

  icu::StringCharacterIterator iter(uword);
  for (iter.setToStart(); iter.hasNext(); ++ length) {
    const UChar32 ch = iter.next32PostInc();

    // keep initial char...
    if (ch0 == 0)
      ch0 = ch;

    // slide the two-character window over the end of the word
    ch_2 = ch_1;
    ch_1 = ch;

    const int32_t gc = u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);

    has_dash   |= ((gc & U_GC_PD_MASK) != 0);
    has_punct  |= ((gc & U_GC_P_MASK) != 0);
    has_symbol |= ((gc & U_GC_S_MASK) != 0);
    has_digit  |= (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE);

    if (u_isUAlphabetic(ch)) {
      if (u_isULowercase(ch))
        has_lower = true;
      else if (u_istitle(ch)) {
        // a titlecase letter counts both as lowercase and as a capital
        has_lower = true;
        ++ num_caps;
      } else
        ++ num_caps;
    }
  }

  // transform into lower...
  // (the whole word for the suffix tests below, and the last two characters
  // for the "-s" test)
  uword.toLower();
  ch_2 = (ch_2 ? u_tolower(ch_2) : ch_2);
  ch_1 = (ch_1 ? u_tolower(ch_1) : ch_1);

  // we do not check loc...
  if (u_isUUppercase(ch0) || u_istitle(ch0))
    signature += "-caps";
  else if (! u_isUAlphabetic(ch0) && num_caps)
    signature += "-caps";
  else if (has_lower)
    signature += "-lc";

  if (has_digit)
    signature += "-num";
  if (has_dash)
    signature += "-dash";
  if (has_punct)
    signature += "-punct";
  if (has_symbol)
    signature += "-sym";

  if (length >= 3 && ch_1 == 's') {
    // final 's' that is not preceded by 's', 'i' or 'u'
    if (ch_2 != 's' && ch_2 != 'i' && ch_2 != 'u')
      signature += "-s";
  } else if (length >= 5 && ! has_dash && ! (has_digit && num_caps > 0)) {
    // at most one ending suffix is appended: the first that matches
    if (uword.endsWith("ed"))
      signature += "-ed";
    else if (uword.endsWith("ing"))
      signature += "-ing";
    else if (uword.endsWith("ion"))
      signature += "-ion";
    else if (uword.endsWith("er"))
      signature += "-er";
    else if (uword.endsWith("est"))
      signature += "-est";
    else if (uword.endsWith("ly"))
      signature += "-ly";
    else if (uword.endsWith("ity"))
      signature += "-ity";
    else if (uword.endsWith("y"))
      signature += "-y";
    else if (uword.endsWith("al"))
      signature += "-al";
  }

  signature += '>';

  return signature;
}
/**
 * Explores the given dictionary to match the given word.
 *
 * This is a recursive exploration: 'offset' is the dictionary state to
 * read, 'pos_word' the current position in 'word', and 'inflected' the
 * dictionary form built so far. Besides exact (or uppercase-compatible)
 * matches, the function tries the error operations allowed by 'cfg'
 * (SP_SWAP, SP_CHANGE, SP_SUPPR and SP_INSERT), each one guarded by its
 * own counter and by the global error counter. Counters, 'cfg->pairs',
 * 'inflected' and 'output' are modified before each recursive call and
 * put back afterwards.
 *
 * Matches found on final states when the whole word has been consumed are
 * reported through deal_with_matches.
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
		Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
/* Values saved so that they can be used/restored after the recursive calls */
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
int z=save_output(output);
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
	if (word[pos_word]=='\0') {
		/* If we have a match */
		deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
	}
	base=output->len;
}
/* If we are at the end of the token, then we stop */
if (word[pos_word]=='\0') {
	return;
}
/* l2 is the length of 'inflected' on entry, used to undo our additions */
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
	restore_output(z,output);
	offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
	/* For backup_output, see comment below */
	int backup_output=save_output(output);
	if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
		/* The transition matches the current letter, possibly modulo
		 * an uppercase in the word: no error is counted */
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
	} else {
		/* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
		if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
				&& is_letter_swap(cfg,word,pos_word,inflected,c)) {
			/* We don't modify the number of errors since we override an existing
			 * SP_CHANGE_XXX one */
			cfg->current_SP_SWAP++;
			/* We override the previous change */
			int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
			int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
			cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
			u_strcat(inflected,c);
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* And we put the overridden pair back */
			cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
			cfg->current_SP_SWAP--;
		} else /* We deal with the SP_CHANGE case */
		       if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
				/* We want letters, not spaces or anything else */
				&& is_letter(c,NULL)
		        /* We do not allow the replacement of a lowercase letter by an uppercase
		         * letter at the beginning of the word like Niserable, unless the whole word
		         * is in uppercase or the letter is the same, modulo the case */
		        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
			cfg->current_errors++;
			cfg->current_SP_CHANGE++;
			/* Now we test all possible kinds of change */
			vector_int_add(cfg->pairs,pos_word);
			u_strcat(inflected,c);
			/* We always add the default case */
			vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
			/* n_elem is the size of 'pairs' with our pair included; the
			 * variants below reset to it and overwrite the pair kind */
			int n_elem=cfg->pairs->nbelems;
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* Then we test the accent case */
			if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
				/* After a call to explore_dic, we must restore the output.
				 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
				 * output including the output associated to the current transition,
				 * which is why we don't use z (output before the current transition)
				 * but backup_output */
				restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
			    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
			    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And the case variations */
			if (u_tolower(c)==u_tolower(word[pos_word])) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And finally the position on keyboard */
			if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			cfg->pairs->nbelems=size_pairs;
			cfg->current_errors--;
			cfg->current_SP_CHANGE--;
			/* End of the SP_CHANGE case */
		}
	}
	restore_output(backup_output,output);
	truncate(inflected,l2);
	/* Now we deal with the SP_SUPPR case */
	/* Here the transition letter is added to 'inflected' while the
	 * position in the word stays the same */
	if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
		/* We want letters, not spaces or anything else */
		&& is_letter(c,NULL)) {
		cfg->current_errors++;
		cfg->current_SP_SUPPR++;
		vector_int_add(cfg->pairs,pos_word);
		if (pos_word>=1 && c==word[pos_word-1]) {
			vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
		} else {
			vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
		}
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
		truncate(inflected,l2);
		cfg->pairs->nbelems=size_pairs;
		cfg->current_errors--;
		cfg->current_SP_SUPPR--;
	}
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
	/* We want letters, not spaces or anything else */
	&& is_letter(word[pos_word],NULL)
	/* We do not allow the insertion of a capital letter at the beginning of
	 * the word like Astreet, unless the whole word is in uppercase like ASTREET */
    && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
	cfg->current_errors++;
	cfg->current_SP_INSERT++;
	vector_int_add(cfg->pairs,pos_word);
	if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
		vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
	} else {
		vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
	}
	explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
	truncate(inflected,l2);
	cfg->pairs->nbelems=size_pairs;
	cfg->current_errors--;
	cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
// Native helper: returns the u_tolower() mapping of the given code point.
static jint Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint)
{
    const jint lowered = u_tolower(codePoint);
    return lowered;
}
//static jint Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint) { JNIEXPORT jint JNICALL Java_java_lang_Character_toLowerCaseImpl(JNIEnv*, jclass, jint codePoint) { return u_tolower(codePoint); }
/**
 * Sample 06: reads "data06.txt" as UTF-8, decodes it code point by code
 * point with ucnv_getNextUChar(), and prints a frequency table of the code
 * points found, together with the number of letters and of "ie" / "gh"
 * digraphs.
 *
 * Only code points below charCount (the BMP) are supported.
 *
 * @return U_ZERO_ERROR on success, U_FILE_ACCESS_ERROR if the input file
 *         cannot be opened, U_MEMORY_ALLOCATION_ERROR if the frequency table
 *         cannot be allocated, U_UNSUPPORTED_ERROR if a code point outside
 *         the table is met.
 */
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;   /* previous code point, for the digraph counters */

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    /* The size is a size_t: print it through an explicit unsigned long. */
    fprintf(stderr, " Couldn't allocate %lu bytes for freq counter\n",
            (unsigned long)(sizeof(CharFreqInfo)*charCount));
    /* Going on would dereference a null pointer below. */
    fclose(f);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      U_ASSERT(status);
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      /* info has exactly charCount entries, so p == charCount is already
         out of range. */
      if(p>=charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        /* Release everything acquired so far before bailing out. */
        free(info);
        free(uBuf);
        fclose(f);
        ucnv_close(conv);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  free(uBuf);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
static void demoCaseMapInC() { /* * input= * "aB<capital sigma>" * "iI<small dotless i><capital dotted I> " * "<sharp s> <small lig. ffi>" * "<small final sigma><small sigma><capital sigma>" */ static const UChar input[]={ 0x61, 0x42, 0x3a3, 0x69, 0x49, 0x131, 0x130, 0x20, 0xdf, 0x20, 0xfb03, 0x3c2, 0x3c3, 0x3a3, 0 }; UChar buffer[32]; UErrorCode errorCode; UChar32 c; int32_t i, j, length; UBool isError; printf("\n* demoCaseMapInC() ----------------- ***\n\n"); /* * First, use simple case mapping functions which provide * 1:1 code point mappings without context/locale ID. * * Note that some mappings will not be "right" because some "real" * case mappings require context, depend on the locale ID, * and/or result in a change in the number of code points. */ printUString("input string: ", input, -1); /* uppercase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_toupper(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-uppercased: ", buffer, j); /* lowercase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_tolower(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-lowercased: ", buffer, j); /* titlecase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_totitle(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-titlecased: ", buffer, j); /* 
case-fold/default */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_foldCase(c, U_FOLD_CASE_DEFAULT); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-case-folded/default: ", buffer, j); /* case-fold/Turkic */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-case-folded/Turkic: ", buffer, j); /* * Second, use full case mapping functions which provide * 1:n code point mappings (n can be 0!) and are sensitive to context and locale ID. * * Note that lower/upper/titlecasing take a locale ID while case-folding * has bit flag options instead, by design of the Unicode SpecialCasing.txt UCD file. * * Also, string titlecasing requires a BreakIterator to find starts of words. * The sample code here passes in a NULL pointer; u_strToTitle() will open and close a default * titlecasing BreakIterator automatically. * For production code where many strings are titlecased it would be more efficient * to open a BreakIterator externally and pass it in. 
*/ printUString("\ninput string: ", input, -1); /* lowercase/English */ errorCode=U_ZERO_ERROR; length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-lowercased/en: ", buffer, length); } else { printf("error in u_strToLower(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* lowercase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-lowercased/tr: ", buffer, length); } else { printf("error in u_strToLower(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* uppercase/English */ errorCode=U_ZERO_ERROR; length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-uppercased/en: ", buffer, length); } else { printf("error in u_strToUpper(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* uppercase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-uppercased/tr: ", buffer, length); } else { printf("error in u_strToUpper(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* titlecase/English */ errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-titlecased/en: ", buffer, length); } else { printf("error in u_strToTitle(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* titlecase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-titlecased/tr: ", buffer, length); } else { printf("error in u_strToTitle(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* case-fold/default */ errorCode=U_ZERO_ERROR; length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, 
-1, U_FOLD_CASE_DEFAULT, &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-case-folded/default: ", buffer, length); } else { printf("error in u_strFoldCase(default)=%ld error=%s\n", length, u_errorName(errorCode)); } /* case-fold/Turkic */ errorCode=U_ZERO_ERROR; length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-case-folded/Turkic: ", buffer, length); } else { printf("error in u_strFoldCase(Turkic)=%ld error=%s\n", length, u_errorName(errorCode)); } }
// Returns the u_tolower() mapping of the given code point.
jint fastiva_vm_Character_C$__toLowerCaseImpl(jint codePoint)
{
    const jint lowered = u_tolower(codePoint);
    return lowered;
}
// // this function explores the dictionary to decompose the word mot // void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component, const unichar* original_word,int pos_in_original_word,const unichar* decomposition, unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp, const char* left,const char* right, const struct INF_codes* inf_codes,const Alphabet* alphabet, const unsigned char* tableau_bin) { int c; int index,t; c=tableau_bin[adresse]*256+tableau_bin[adresse+1]; if (!(c&32768)) { // if we are in a terminal state index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4]; current_component[pos_in_current_component]='\0'; if (pos_in_current_component>1) { // we don't consider words with a length of 1 if (original_word[pos_in_original_word]=='\0') { // if we have explored the entire original word if (right[index]) { // and if we have a valid right component struct list_ustring* l=inf_codes->codes[index]; while (l!=NULL) { unichar dec[500]; u_strcpy(dec,decomposition); if (dec[0]!='\0') {u_strcat(dec," +++ ");} unichar entry[500]; uncompress_entry(current_component,l->string,entry); u_strcat(dec,entry); unichar new_dela_line[500]; struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1); if (tmp_entry==NULL) { /* If there was an error in the dictionary, we skip the entry */ l=l->next; continue; } // change case if there is a prefix // prefixes are downcase, nouns (=suffixes) uppercase: // "investitionsObjekte" -> "Investitionsobjekte" if ( u_strlen(dela_line) != 0 ) { // capitalize dela_line dela_line[0] = u_toupper((unichar) dela_line[0]); // downcase lemma and inflected tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]); tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]); } u_strcpy(new_dela_line,dela_line); u_strcat(new_dela_line,tmp_entry->inflected); u_strcat(new_dela_line,","); u_strcat(new_dela_line,dela_line); u_strcat(new_dela_line,tmp_entry->lemma); 
u_strcat(new_dela_line,"."); u_strcat(new_dela_line,tmp_entry->semantic_codes[0]); int k; for (k=1;k<tmp_entry->n_semantic_codes;k++) { u_strcat(new_dela_line,"+"); u_strcat(new_dela_line,tmp_entry->semantic_codes[k]); } for (k=0;k<tmp_entry->n_inflectional_codes;k++) { u_strcat(new_dela_line,":"); u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]); } free_dela_entry(tmp_entry); struct german_word_decomposition* wd=new_german_word_decomposition(); wd->n_parts=n_decomp; u_strcpy(wd->decomposition,dec); u_strcpy(wd->dela_line,new_dela_line); if (check_valid_right_component_for_one_INF_code_german(l->string)) { // if we got a correct right component (N-FF) struct german_word_decomposition_list* wdl=new_german_word_decomposition_list(); wdl->element=wd; wdl->suivant=(*L); (*L)=wdl; } else { free_german_word_decomposition(wd); } l=l->next; } } } else { // else, we must explore the rest of the original word if (left[index]) { // but only if the current component was a valid left one // we go on with the next component unichar dec[2000]; unichar line[500]; u_strcpy(dec,decomposition); if (dec[0]!='\0') {u_strcat(dec," +++ ");} unichar sia_code[500]; unichar entry[500]; get_first_sia_code_german(index,sia_code,inf_codes); uncompress_entry(current_component,sia_code,entry); u_strcat(dec,entry); u_strcpy(line,dela_line); u_strcat(line,current_component); unichar temp[500]; explore_state_german(4,temp,0,original_word,pos_in_original_word, dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin); } } } t=adresse+5; } else { c=c-32768; t=adresse+2; } if (original_word[pos_in_original_word]=='\0') { // if we have finished, we return return; } // if not, we go on with the next letter for (int i=0;i<c;i++) { if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet) || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) { 
index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4]; current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]); explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1, decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin); } t=t+5; } }
// Returns the result of u_tolower() for the code point c.
uint32
BUnicodeChar::ToLower(uint32 c)
{
	// A temporary BUnicodeChar is constructed and discarded before the lookup.
	// NOTE(review): presumably the constructor performs setup that the
	// conversion relies on -- confirm against the constructor.
	BUnicodeChar();

	const uint32 lowered = u_tolower(c);
	return lowered;
}
// Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { unichar_str = UNICHARSET::kCustomLigatures[i][0]; break; } } // Convert the unichar to UTF32 representation std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str); // Assume that if the property is true for any character in the string, // then it holds for the whole "character". bool unichar_isalpha = false; bool unichar_islower = false; bool unichar_isupper = false; bool unichar_isdigit = false; bool unichar_ispunct = false; for (char32 u_ch : uni_vector) { if (u_isalpha(u_ch)) unichar_isalpha = true; if (u_islower(u_ch)) unichar_islower = true; if (u_isupper(u_ch)) unichar_isupper = true; if (u_isdigit(u_ch)) unichar_isdigit = true; if (u_ispunct(u_ch)) unichar_ispunct = true; } unicharset->set_isalpha(unichar_id, unichar_isalpha); unicharset->set_islower(unichar_id, unichar_islower); unicharset->set_isupper(unichar_id, unichar_isupper); unicharset->set_isdigit(unichar_id, unichar_isdigit); unicharset->set_ispunctuation(unichar_id, unichar_ispunct); tesseract::IcuErrorCode err; unicharset->set_script(unichar_id, uscript_getName( uscript_getScript(uni_vector[0], err))); const int num_code_points = uni_vector.size(); // Obtain the lower/upper case if needed and record it in the properties. unicharset->set_other_case(unichar_id, unichar_id); if (unichar_islower || unichar_isupper) { std::vector<char32> other_case(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. 
// However since they deal with UChars (so need a conversion function // from char32 or UTF8string) and require a meaningful locale string, // for now u_tolower()/u_toupper() are used. other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]); } std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case); UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str()); if (other_case_id != INVALID_UNICHAR_ID) { unicharset->set_other_case(unichar_id, other_case_id); } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str); } } // Set RTL property and obtain mirror unichar ID from ICU. std::vector<char32> mirrors(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { mirrors[i] = u_charMirror(uni_vector[i]); if (i == 0) { // set directionality to that of the 1st code point unicharset->set_direction(unichar_id, static_cast<UNICHARSET::Direction>( u_charDirection(uni_vector[i]))); } } std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors); UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); if (mirror_uch_id != INVALID_UNICHAR_ID) { unicharset->set_mirror(unichar_id, mirror_uch_id); } else if (report_errors) { tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str); } // Record normalized version of this unichar. std::string normed_str; if (unichar_id != 0 && tesseract::NormalizeUTF8String( decompose ? tesseract::UnicodeNormMode::kNFKD : tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str, &normed_str) && !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); } ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); } unicharset->post_load_setup(); }
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory && fRanges) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); #if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4)) UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar)); ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager); int c=0; rangeStr[c++] = chOpenSquare; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { XMLCh buffer[10]; XMLSize_t len, j; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; XMLCh* p=buffer; while(*p) rangeStr[c++] = *p++; if(fRanges[i+1]!=fRanges[i]) { rangeStr[c++] = chDash; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; p=buffer; while(*p) rangeStr[c++] = *p++; } } rangeStr[c++] = chCloseSquare; rangeStr[c++] = chNull; UErrorCode ec=U_ZERO_ERROR; USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec); if(range) { ec = U_ZERO_ERROR; uint32_t cbCount=uset_serialize(range, NULL, 0, &ec); uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t)); ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager); ec = U_ZERO_ERROR; uset_serialize(range, buffer, cbCount, &ec); USerializedSet serializedSet; uset_getSerializedSet(&serializedSet, buffer, cbCount); int32_t nSets=uset_getSerializedRangeCount(&serializedSet); 
for(int32_t i=0; i<nSets; i++) { UChar32 start, end; uset_getSerializedRange(&serializedSet, i, &start, &end); lwrToken->addRange(start, end); } // does this release the memory allocated by the set? uset_setSerializedToOne(&serializedSet, 32); uset_close(range); } #else unsigned int exceptIndex = 0; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if XERCES_USE_TRANSCODER_ICU const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif const unsigned int exceptionsSize = sizeof(s_exceptions) / sizeof(s_exceptions[0]); // Add any exception chars. These are characters where the the // case mapping is not symmetric. (Unicode case mappings are not isomorphic...) while (exceptIndex < exceptionsSize) { if (s_exceptions[exceptIndex].baseChar < ch) { ++exceptIndex; } else if (s_exceptions[exceptIndex].baseChar == ch) { const XMLInt32 matchingChar = s_exceptions[exceptIndex].matchingChar; lwrToken->addRange( matchingChar, matchingChar); ++exceptIndex; } else { break; } } } } lwrToken->mergeRanges(this); #endif lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; // TODO(dbertoni) This is a temporary hack until we can change the ABI. // See Jira issue XERCESC-1866 for more details. // Overload the fCaseIToken data member to be the case-insensitive token // that's caching the case-insensitive one. We need this because tokens // have varying lifetimes. 
fCaseIToken->setCaseInsensitiveToken(this); } return fCaseIToken; }