/**
 * Splits String into per-code-point tokens and appends a terminating Null token.
 *
 * Every code point visited by the ICU character iterator yields one FToken whose type is
 * Lowercase, Uppercase, Digit or Other (tested in that order) and whose index is the
 * source index reported by FICUTextCharacterIterator::InternalIndexToSourceIndex.
 * A final ETokenType::Null token at index String.Len() is always appended.
 *
 * @param OutTokens	Array that receives the tokens; its previous contents are discarded.
 */
void FICUCamelCaseBreakIterator::TokenizeString(TArray<FToken>& OutTokens)
{
	// Reserve room for the per-character tokens plus the trailing Null token, so that
	// appending the terminator does not need to grow the array.
	OutTokens.Empty(String.Len() + 1);

	FICUTextCharacterIterator CharIter(String);
	for(CharIter.setToStart(); CharIter.current32() != FICUTextCharacterIterator::DONE; CharIter.next32PostInc())
	{
		const UChar32 CurrentChar = CharIter.current32();

		// Anything that is not lowercase, uppercase or a digit is classified as Other
		ETokenType TokenType = ETokenType::Other;
		if(u_isULowercase(CurrentChar))
		{
			TokenType = ETokenType::Lowercase;
		}
		else if(u_isUUppercase(CurrentChar))
		{
			TokenType = ETokenType::Uppercase;
		}
		else if(u_isdigit(CurrentChar))
		{
			TokenType = ETokenType::Digit;
		}

		// Tokens store indices into the source string rather than the iterator's internal indices
		const int32 CharIndex = CharIter.InternalIndexToSourceIndex(CharIter.getIndex());

		// Construct the token directly inside the array instead of moving a temporary in
		OutTokens.Emplace(TokenType, CharIndex);
	}

	OutTokens.Emplace(ETokenType::Null, String.Len());

	// There should always be at least one token for the end of the string
	check(OutTokens.Num());
}
/*
 * One-time ICU initialization entry point.
 *
 * The global mutexes are initialized first, after which gICUInitMutex is held
 * for the remainder of the call. The initialization work is skipped when
 * gICUInitialized is already set or when *status reports a failure on entry.
 *
 * status: in/out ICU error code; passed on to the initialization calls.
 */
U_CAPI void U_EXPORT2
u_init(UErrorCode *status) {
    UTRACE_ENTRY_OC(UTRACE_U_INIT);

    /* The global mutexes have to be initialized before the lock is taken. */
    umtx_init(NULL);
    umtx_lock(&gICUInitMutex);

    /* Only do the work on the first call, and only if no error is pending. */
    if (!gICUInitialized && !U_FAILURE(*status)) {
#if 1
        /*
         * 2005-may-02
         *
         * ICU4C 3.4 (jitterbug 4497) hardcodes the data for Unicode character
         * properties for APIs that want to be fast, so they are neither
         * loaded nor error-checked here.
         * The converter alias table is loaded instead, to see if any ICU data
         * is available.
         * Users should really open the service objects they need and check
         * for errors there, to make sure that the actual items they need are
         * available.
         */
#if !UCONFIG_NO_CONVERSION
        ucnv_io_countKnownConverters(status);
#endif
#else
        /*
         * Disabled alternative: required init for services that have no open
         * operation and use "only" the double-check initialization method for
         * performance reasons (avoiding a mutex lock even for _checking_
         * whether the initialization had occurred).
         */

        /* Char Properties */
        uprv_haveProperties(status);

        /* load the case and bidi properties but don't fail if they are not available */
        u_isULowercase(0x61);
        u_getIntPropertyValue(0x200D, UCHAR_JOINING_TYPE); /* ZERO WIDTH JOINER: Join_Causing */

#if !UCONFIG_NO_NORMALIZATION
        /* Normalization */
        unorm_haveData(status);
#endif
#endif
        gICUInitialized = TRUE;    /* TODO: don't set if U_FAILURE? */
    }

    umtx_unlock(&gICUInitMutex);
    UTRACE_EXIT_STATUS(*status);
}
/*
 * Returns the score for one matched character: m->max_score_per_char scaled
 * by a factor chosen from `last` and `current`.
 *
 * The rules are tried in this order, and the first one that applies wins:
 *   - `last` occurs in m->level1                        -> 0.9
 *   - `last` occurs in m->level2                        -> 0.8
 *   - `last` is lowercase and `current` is uppercase    -> 0.8 (CamelCase)
 *   - `last` occurs in m->level3                        -> 0.7
 *   - otherwise                                         -> 0.75 / distance_from_last_match
 */
static double
calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) {
    const double base = m->max_score_per_char;

    if (u_strchr32(m->level1, last) != NULL) {
        return base * 0.9;
    }
    if (u_strchr32(m->level2, last) != NULL) {
        return base * 0.8;
    }
    /* CamelCase boundary */
    if (u_isULowercase(last) && u_isUUppercase(current)) {
        return base * 0.8;
    }
    if (u_strchr32(m->level3, last) != NULL) {
        return base * 0.7;
    }

    /* `last` is not a special char: the factor shrinks as the distance from
     * the last matched char grows. */
    const double decayed = (1.0 / distance_from_last_match) * 0.75;
    return base * decayed;
}
// Builds the unknown-word signature for a symbol.
//
// The signature starts with "<unk", gains a tag for each feature detected in
// the word (capitalization, digits, dashes, punctuation, symbols, and a
// word-ending tag), and is closed with '>'.
// The rules are the English ones taken from Stanford parser's getSignature5.
symbol_type operator()(const symbol_type& symbol) const
{
  const std::string& utf8 = static_cast<const std::string&>(symbol);
  icu::UnicodeString text = icu::UnicodeString::fromUTF8(icu::StringPiece(utf8.data(), utf8.size()));

  std::string signature = "<unk";

  // features collected while scanning the word
  int  caps = 0;
  bool saw_digit = false;
  bool saw_dash = false;
  bool saw_lower = false;
  bool saw_punct = false;
  bool saw_symbol = false;
  size_t length = 0;

  // first, last, and second-to-last code points
  UChar32 first = 0;
  UChar32 last = 0;
  UChar32 before_last = 0;

  icu::StringCharacterIterator iter(text);
  iter.setToStart();
  while (iter.hasNext()) {
    const UChar32 ch = iter.next32PostInc();

    // remember the initial char...
    if (first == 0)
      first = ch;

    before_last = last;
    last = ch;

    const int32_t gc = u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);

    if ((gc & U_GC_PD_MASK) != 0)
      saw_dash = true;
    if ((gc & U_GC_P_MASK) != 0)
      saw_punct = true;
    if ((gc & U_GC_S_MASK) != 0)
      saw_symbol = true;
    if (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE)
      saw_digit = true;

    if (u_isUAlphabetic(ch)) {
      // a titlecase letter counts both as lowercase evidence and as a capital
      const bool lower = u_isULowercase(ch);
      const bool title = ! lower && u_istitle(ch);

      if (lower || title)
        saw_lower = true;
      if (! lower)
        ++ caps;
    }

    ++ length;
  }

  // the remaining checks work on the lowercased word and trailing chars...
  text.toLower();
  if (before_last)
    before_last = u_tolower(before_last);
  if (last)
    last = u_tolower(last);

  // we do not check loc...
  const bool capitalized = (u_isUUppercase(first) || u_istitle(first)
                            || (! u_isUAlphabetic(first) && caps));
  if (capitalized)
    signature += "-caps";
  else if (saw_lower)
    signature += "-lc";

  if (saw_digit)
    signature += "-num";
  if (saw_dash)
    signature += "-dash";
  if (saw_punct)
    signature += "-punct";
  if (saw_symbol)
    signature += "-sym";

  if (length >= 3 && last == 's') {
    // plural-like "-s", unless the word ends in "ss", "is" or "us"
    const bool excluded = (before_last == 's' || before_last == 'i' || before_last == 'u');
    if (! excluded)
      signature += "-s";
  } else if (length >= 5 && ! saw_dash && (! saw_digit || caps <= 0)) {
    // word endings, in priority order: only the first match is used
    struct ending_type
    {
      const char* suffix;
      const char* tag;
    };

    static const ending_type endings[] = {
      {"ed",  "-ed"},
      {"ing", "-ing"},
      {"ion", "-ion"},
      {"er",  "-er"},
      {"est", "-est"},
      {"ly",  "-ly"},
      {"ity", "-ity"},
      {"y",   "-y"},
      {"al",  "-al"},
    };

    const size_t num_endings = sizeof(endings) / sizeof(endings[0]);
    for (size_t i = 0; i != num_endings; ++ i)
      if (text.endsWith(endings[i].suffix)) {
        signature += endings[i].tag;
        break;
      }
  }

  signature += '>';

  return signature;
}
// Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). bool BUnicodeChar::IsLower(uint32 c) { BUnicodeChar(); return u_isULowercase(c); }