/**
 * Walks String one code point at a time and appends one token per code point to OutTokens,
 * classifying each as Lowercase, Uppercase, Digit or Other. A final Null token positioned at
 * String.Len() is always appended to mark the end of the string.
 *
 * @param OutTokens	Array that receives the tokens; any previous contents are discarded.
 */
void FICUCamelCaseBreakIterator::TokenizeString(TArray<FToken>& OutTokens)
{
	// At most one token per UTF-16 unit plus the terminating Null token, so reserve Len() + 1
	// up front to avoid a reallocation when the terminator is appended.
	OutTokens.Empty(String.Len() + 1);

	FICUTextCharacterIterator CharIter(String);
	for(CharIter.setToStart(); CharIter.current32() != FICUTextCharacterIterator::DONE; CharIter.next32PostInc())
	{
		const UChar32 CurrentChar = CharIter.current32();

		// Anything that is not a lowercase letter, uppercase letter or digit stays "Other"
		ETokenType TokenType = ETokenType::Other;
		if(u_isULowercase(CurrentChar))
		{
			TokenType = ETokenType::Lowercase;
		}
		else if(u_isUUppercase(CurrentChar))
		{
			TokenType = ETokenType::Uppercase;
		}
		else if(u_isdigit(CurrentChar))
		{
			TokenType = ETokenType::Digit;
		}

		// Tokens store the index translated by InternalIndexToSourceIndex rather than the iterator's own index
		const int32 CharIndex = CharIter.InternalIndexToSourceIndex(CharIter.getIndex());

		// Construct the token in place instead of building a temporary and moving it in
		OutTokens.Emplace(TokenType, CharIndex);
	}

	OutTokens.Emplace(ETokenType::Null, String.Len());

	// There should always be at least one token for the end of the string
	check(OutTokens.Num());
}
Beispiel #2
0
/**
 * Runs the ICU start-up work while holding gICUInitMutex and reports the outcome via *status.
 * The start-up work is skipped when gICUInitialized is already set or when *status
 * already holds a failure code on entry.
 *
 * @param status  in/out ICU error code; it is dereferenced unconditionally.
 */
U_CAPI void U_EXPORT2
u_init(UErrorCode *status) {
    UTRACE_ENTRY_OC(UTRACE_U_INIT);
    /* The global mutexes must exist before gICUInitMutex can be taken. */
    umtx_init(NULL);
    umtx_lock(&gICUInitMutex);
    if (!gICUInitialized && !U_FAILURE(*status)) {
#if 1
        /*
         * 2005-may-02
         *
         * As of ICU4C 3.4 (jitterbug 4497) the Unicode character property data
         * used by the fast APIs is hardcoded, so there is nothing to load or
         * error-check for it here.
         * The converter alias table is loaded instead, as a probe for whether
         * any ICU data is available at all.
         * Callers should still open the service objects they actually need and
         * check those for errors, to be sure the specific items are present.
         */
#if !UCONFIG_NO_CONVERSION
        ucnv_io_countKnownConverters(status);
#endif
#else
        /* Initialize the services that have no open operation and rely
         * "only" on double-check initialization for performance (so that even
         * _checking_ whether initialization happened avoids a mutex lock).
         */

        /* Char Properties */
        uprv_haveProperties(status);

        /* Load the case and bidi properties; their absence is not treated as a failure. */
        u_isULowercase(0x61);
        u_getIntPropertyValue(0x200D, UCHAR_JOINING_TYPE); /* ZERO WIDTH JOINER: Join_Causing */

#if !UCONFIG_NO_NORMALIZATION
        /*  Normalization  */
        unorm_haveData(status);
#endif
#endif
        gICUInitialized = TRUE;    /* TODO:  don't set if U_FAILURE? */
    }
    /* Single exit: both the skipped and the executed path unlock and trace the status. */
    umtx_unlock(&gICUInitMutex);
    UTRACE_EXIT_STATUS(*status);
}
Beispiel #3
0
// Score one matched character: m->max_score_per_char scaled by a factor chosen
// from the character preceding the match (`last`) and the matched character (`current`).
// The first rule that applies wins:
//   last is in m->level1                     -> 0.9
//   last is in m->level2                     -> 0.8
//   last is lowercase, current is uppercase  -> 0.8 (CamelCase boundary)
//   last is in m->level3                     -> 0.7
//   otherwise                                -> 0.75 / distance_from_last_match
static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, int32_t distance_from_last_match) {
    const double per_char = m->max_score_per_char;

    if (u_strchr32(m->level1, last) != NULL) {
        return per_char * 0.9;
    }
    if (u_strchr32(m->level2, last) != NULL) {
        return per_char * 0.8;
    }
    if (u_isULowercase(last) && u_isUUppercase(current)) {
        return per_char * 0.8;  // CamelCase
    }
    if (u_strchr32(m->level3, last) != NULL) {
        return per_char * 0.7;
    }

    // No special preceding character: the further this match is from the
    // previous matched character, the smaller the factor becomes.
    return per_char * ((1.0 / distance_from_last_match) * 0.75);
}
Beispiel #4
0
      // Builds an "<unk...>" signature for a word from its surface features:
      // capitalization, digits, dashes, punctuation, symbols and a few English
      // suffixes. The rules follow getSignature5 from the Stanford parser.
      symbol_type operator()(const symbol_type& symbol) const
      {
        const std::string& utf8 = static_cast<const std::string&>(symbol);
        icu::UnicodeString text = icu::UnicodeString::fromUTF8(icu::StringPiece(utf8.data(), utf8.size()));

        // features collected in a single pass over the code points
        int  upper_count = 0;
        bool seen_digit  = false;
        bool seen_dash   = false;
        bool seen_lower  = false;
        bool seen_punct  = false;
        bool seen_symbol = false;

        size_t  num_chars   = 0;
        UChar32 first       = 0;
        UChar32 last        = 0;
        UChar32 before_last = 0;

        icu::StringCharacterIterator cursor(text);
        cursor.setToStart();
        while (cursor.hasNext()) {
          const UChar32 cp = cursor.next32PostInc();
          ++ num_chars;

          // remember the first code point (0 doubles as the "not set yet" marker)
          if (first == 0)
            first = cp;

          // slide the two-character window tracking the end of the word
          before_last = last;
          last = cp;

          const int32_t category = u_getIntPropertyValue(cp, UCHAR_GENERAL_CATEGORY_MASK);

          if ((category & U_GC_PD_MASK) != 0)
            seen_dash = true;
          if ((category & U_GC_P_MASK) != 0)
            seen_punct = true;
          if ((category & U_GC_S_MASK) != 0)
            seen_symbol = true;

          if (u_getNumericValue(cp) != U_NO_NUMERIC_VALUE)
            seen_digit = true;

          if (! u_isUAlphabetic(cp))
            ;
          else if (u_isULowercase(cp))
            seen_lower = true;
          else {
            // titlecase counts as both lower and capital; every other
            // non-lowercase alphabetic counts as a capital
            if (u_istitle(cp))
              seen_lower = true;
            ++ upper_count;
          }
        }

        // lowercase the word and the two trailing code points for the suffix tests
        text.toLower();
        if (before_last)
          before_last = u_tolower(before_last);
        if (last)
          last = u_tolower(last);

        std::string signature("<unk");

        // the position of the word in the sentence is not taken into account
        const bool starts_capitalized = u_isUUppercase(first) || u_istitle(first);
        if (starts_capitalized || (! u_isUAlphabetic(first) && upper_count != 0))
          signature += "-caps";
        else if (seen_lower)
          signature += "-lc";

        if (seen_digit)
          signature += "-num";
        if (seen_dash)
          signature += "-dash";
        if (seen_punct)
          signature += "-punct";
        if (seen_symbol)
          signature += "-sym";

        if (num_chars >= 3 && last == 's') {
          // a trailing "s" counts unless the word ends in "ss", "is" or "us"
          const bool excluded = (before_last == 's' || before_last == 'i' || before_last == 'u');
          if (! excluded)
            signature += "-s";
        } else if (num_chars >= 5 && ! seen_dash && ! (seen_digit && upper_count > 0)) {
          // tried in this exact order; only the first matching suffix is recorded
          static const char* const suffixes[] = {
            "ed", "ing", "ion", "er", "est", "ly", "ity", "y", "al"
          };
          const size_t num_suffixes = sizeof(suffixes) / sizeof(suffixes[0]);

          for (size_t i = 0; i != num_suffixes; ++ i)
            if (text.endsWith(suffixes[i])) {
              signature += '-';
              signature += suffixes[i];
              break;
            }
        }

        signature += '>';

        return signature;
      }
Beispiel #5
0
// Report whether code point c carries the Lowercase Unicode property
// (UCHAR_LOWERCASE), as answered by ICU's u_isULowercase().
bool
BUnicodeChar::IsLower(uint32 c)
{
	// A temporary BUnicodeChar is constructed and discarded before the query;
	// presumably its constructor performs required set-up -- TODO confirm.
	BUnicodeChar();
	return u_isULowercase(c) != 0;
}