//static jint Character_getNumericValueImpl(JNIEnv*, jclass, jint codePoint){ JNIEXPORT jint JNICALL Java_java_lang_Character_getNumericValueImpl(JNIEnv*, jclass, jint codePoint){ // The letters A-Z in their uppercase ('\u0041' through '\u005A'), // lowercase ('\u0061' through '\u007A'), // and full width variant ('\uFF21' through '\uFF3A' // and '\uFF41' through '\uFF5A') forms // have numeric values from 10 through 35. This is independent of the // Unicode specification, which does not assign numeric values to these // char values. if (codePoint >= 0x41 && codePoint <= 0x5A) { return codePoint - 0x37; } if (codePoint >= 0x61 && codePoint <= 0x7A) { return codePoint - 0x57; } if (codePoint >= 0xFF21 && codePoint <= 0xFF3A) { return codePoint - 0xFF17; } if (codePoint >= 0xFF41 && codePoint <= 0xFF5A) { return codePoint - 0xFF37; } double result = u_getNumericValue(codePoint); if (result == U_NO_NUMERIC_VALUE) { return -1; } else if (result < 0 || floor(result + 0.5) != result) { return -2; } return result; }
/**
 * Returns the ICU numeric value of a code point as a non-negative jint.
 *
 * @param codePoint the Unicode code point to look up.
 * @return the value reported by u_getNumericValue() when it is a non-negative
 *         integer that fits in a jint;
 *         -1 when the code point has no numeric value;
 *         -2 when the value is negative, fractional, or too large for a jint.
 */
static jint Character_getNumericValueImpl(JNIEnv*, jclass, jint codePoint) {
    const double result = u_getNumericValue(codePoint);
    // Must be tested first: every later check assumes a real numeric value.
    if (result == U_NO_NUMERIC_VALUE) {
        return -1;
    }

    // jint is a 32-bit signed integer. The value arrives as a double, so it
    // can exceed that range; converting an out-of-range double to an integer
    // is undefined behavior, hence the explicit upper bound.
    const double kMaxJint = 2147483647.0;
    if (result < 0 || result > kMaxJint || floor(result + 0.5) != result) {
        // Negative, too large, or not a whole number.
        return -2;
    }
    return static_cast<jint>(result);
}
// Computes the set of numerics for a string, according to UTS 39 section 5.3. void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { result.clear(); UChar32 codePoint; for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { codePoint = input.char32At(i); // Store a representative character for each kind of decimal digit if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { // Store the zero character as a representative for comparison. // Unicode guarantees it is codePoint - value result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); } } }
// Builds an unknown-word signature of the form
//   <unk[-CATEGORY...][-SCRIPT...][-NUM]>
// from the Unicode properties of the code points in `symbol`.
//
// - One "-<name>" is appended per general category seen (names come from
//   general_category_), in increasing category index order.
// - One "-<name>" is appended per script seen (names come from script_), in
//   increasing script code order.
// - "-NUM" is appended when any code point has an ICU numeric value.
//
// Index 0 is skipped in both tables: general category 0 (unassigned) and
// script code 0 (USCRIPT_COMMON) never contribute to the signature.
symbol_type operator()(const symbol_type& symbol) const
{
  const std::string& word = static_cast<const std::string&>(symbol);
  icu::UnicodeString uword = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));

  bool dg = false;                          // any code point with a numeric value
  uint32_t gc = 0;                          // union of general-category mask bits
  // NOTE(review): sc is indexed by UScriptCode below and scanned up to
  // USCRIPT_CODE_LIMIT; this assumes script_.size() >= USCRIPT_CODE_LIMIT -- confirm.
  uscript_type sc(script_.size(), false);   // scripts seen, indexed by UScriptCode

  icu::StringCharacterIterator iter(uword);
  for (iter.setToStart(); iter.hasNext(); /**/) {
    const UChar32 ch = iter.next32PostInc();

    dg |= (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE);
    gc |= u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);
    sc[u_getIntPropertyValue(ch, UCHAR_SCRIPT)] = true;
  }

  std::string signature = "<unk";

  for (int i = 1; i < U_CHAR_CATEGORY_COUNT; ++ i)
    if (gc & U_MASK(i)) {
      signature += "-";
      signature += general_category_[i];
    }

  for (int i = 1; i < USCRIPT_CODE_LIMIT; ++ i)
    if (sc[i]) {
      signature += "-";
      signature += script_[i];
    }

  if (dg)
    signature += "-NUM";

  signature += '>';

  return signature;
}
// IntlChar::getNumericValue(arg): returns ICU's numeric value for the code
// point described by `arg`, as a double wrapped in a Variant.
Variant HHVM_STATIC_METHOD(IntlChar, getNumericValue, const Variant& arg) {
  // GETCP is a macro defined elsewhere; presumably it decodes `arg` into the
  // code point `cp` and returns early on invalid input -- confirm at its
  // definition.
  GETCP(arg, cp);

  const double numericValue = u_getNumericValue(cp);
  return numericValue;
}
// Replaces the identifier held by this object and recomputes its derived data:
//   - fNumerics:              the zero digit of every decimal-digit run used,
//   - fRequiredScripts:       scripts of code points that have exactly one
//                             script (ignoring Common and Inherited),
//   - fScriptSetSet:          alternative script sets of code points that
//                             have several scripts,
//   - fCommonAmongAlternates: intersection of the surviving alternatives.
//
// Does nothing when `status` already indicates failure, and returns early if
// uscript_getScriptExtensions() reports a failure. Always returns *this.
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }

    *fIdentifier = identifier;
    clear();

    ScriptSet cpScripts;
    int32_t offset = 0;
    while (offset < identifier.length()) {
        const UChar32 c = identifier.char32At(offset);
        // Advance by one or two UTF-16 code units, depending on the code point.
        offset += U16_LENGTH(c);

        // Store a representative character for each kind of decimal digit.
        if (u_charType(c) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison.
            // Unicode guarantees it is (code point - digit value).
            const UChar32 digitValue = static_cast<UChar32>(u_getNumericValue(c));
            fNumerics->add(c - digitValue);
        }

        UScriptCode extensions[500];
        const int32_t extensionCount =
            uscript_getScriptExtensions(c, extensions, UPRV_LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }

        // Script set of this code point, without Common and Inherited.
        cpScripts.resetAll();
        for (int32_t k = 0; k < extensionCount; ++k) {
            cpScripts.set(extensions[k], status);
        }
        cpScripts.reset(USCRIPT_COMMON, status);
        cpScripts.reset(USCRIPT_INHERITED, status);

        const int32_t memberCount = cpScripts.countMembers();
        if (memberCount == 1) {
            // Single script, record it.
            fRequiredScripts->Union(cpScripts);
        } else if (memberCount != 0) {
            if (!fRequiredScripts->intersects(cpScripts) &&
                    uhash_geti(fScriptSetSet, &cpScripts) == 0) {
                // If the set hasn't been added already, add it.
                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
                uhash_puti(fScriptSetSet, new ScriptSet(cpScripts), 1, &status);
            }
        }
        // memberCount == 0: only Common/Inherited, nothing to record.
    }

    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();

        int32_t outerPos = UHASH_FIRST;
        const UHashElement *candidateEl = NULL;
        while ((candidateEl = uhash_nextElement(fScriptSetSet, &outerPos)) != NULL) {
            ScriptSet *candidate = static_cast<ScriptSet *>(candidateEl->key.pointer);

            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*candidate)) {
                uhash_removeElement(fScriptSetSet, candidateEl);
                continue;
            }

            fCommonAmongAlternates->intersect(*candidate);

            // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
            // Drop the candidate if any *other* stored set is contained in it.
            int32_t innerPos = UHASH_FIRST;
            const UHashElement *otherEl = NULL;
            while ((otherEl = uhash_nextElement(fScriptSetSet, &innerPos)) != NULL) {
                ScriptSet *other = static_cast<ScriptSet *>(otherEl->key.pointer);
                if (candidate != other && candidate->contains(*other)) {
                    uhash_removeElement(fScriptSetSet, candidateEl);
                    break;
                }
            }
        }
    }

    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
// Builds an English unknown-word signature of the form
//   <unk[-caps|-lc][-num][-dash][-punct][-sym][-SUFFIX]>
// The scheme is taken from the Stanford parser's getSignature5.
//
// Case tag:   "-caps" when the first code point is upper/title case, or when
//             it is not alphabetic but the word contains capitals; otherwise
//             "-lc" when the word contains a lowercase (or titlecase) letter.
// Class tags: "-num", "-dash", "-punct", "-sym" for numeric value, dash
//             punctuation, any punctuation and any symbol respectively.
// Suffix tag: "-s" for words of 3+ code points ending in 's' not preceded by
//             's', 'i' or 'u'; otherwise, for words of 5+ code points without
//             a dash and without (digit and capital), the first matching
//             suffix from a fixed list.
// Word position is not taken into account.
symbol_type operator()(const symbol_type& symbol) const
{
  const std::string& word = static_cast<const std::string&>(symbol);
  icu::UnicodeString lowered = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));

  int capitals = 0;
  bool seen_digit = false;
  bool seen_dash = false;
  bool seen_lower = false;
  bool seen_punct = false;
  bool seen_symbol = false;

  size_t num_chars = 0;      // number of code points
  UChar32 first = 0;         // first code point
  UChar32 last = 0;          // last code point
  UChar32 second_last = 0;   // code point before the last one

  icu::StringCharacterIterator iter(lowered);
  iter.setToStart();
  while (iter.hasNext()) {
    const UChar32 ch = iter.next32PostInc();
    ++ num_chars;

    // keep initial char...
    if (first == 0)
      first = ch;

    second_last = last;
    last = ch;

    const int32_t mask = u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);

    if ((mask & U_GC_PD_MASK) != 0)
      seen_dash = true;
    if ((mask & U_GC_P_MASK) != 0)
      seen_punct = true;
    if ((mask & U_GC_S_MASK) != 0)
      seen_symbol = true;
    if (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE)
      seen_digit = true;

    if (! u_isUAlphabetic(ch))
      continue;

    if (u_isULowercase(ch))
      seen_lower = true;
    else {
      // Titlecase counts as both lowercase and a capital.
      if (u_istitle(ch))
        seen_lower = true;
      ++ capitals;
    }
  }

  // transform into lower...
  lowered.toLower();
  if (second_last)
    second_last = u_tolower(second_last);
  if (last)
    last = u_tolower(last);

  std::string signature = "<unk";

  // we do not check loc...
  if (u_isUUppercase(first) || u_istitle(first) || (! u_isUAlphabetic(first) && capitals))
    signature += "-caps";
  else if (seen_lower)
    signature += "-lc";

  if (seen_digit)
    signature += "-num";
  if (seen_dash)
    signature += "-dash";
  if (seen_punct)
    signature += "-punct";
  if (seen_symbol)
    signature += "-sym";

  if (num_chars >= 3 && last == 's') {
    const bool excluded = (second_last == 's' || second_last == 'i' || second_last == 'u');
    if (! excluded)
      signature += "-s";
  } else if (num_chars >= 5 && ! seen_dash && (! seen_digit || capitals <= 0)) {
    // Checked in this order; the first match wins. Each tag is '-' + suffix.
    static const char* const suffixes[] = {
      "ed", "ing", "ion", "er", "est", "ly", "ity", "y", "al"
    };
    const size_t num_suffixes = sizeof(suffixes) / sizeof(suffixes[0]);

    for (size_t i = 0; i != num_suffixes; ++ i)
      if (lowered.endsWith(suffixes[i])) {
        signature += '-';
        signature += suffixes[i];
        break;
      }
  }

  signature += '>';

  return signature;
}
// Filter predicate: true when the ICU numeric value of `ch` is exactly equal
// to the double that `context` points to.
//
// `context` must point to a double. The comparison is an exact floating-point
// equality test.
static UBool numericValueFilter(UChar32 ch, void* context) {
    const double wanted = *static_cast<double*>(context);
    const double actual = u_getNumericValue(ch);
    return actual == wanted;
}
/*
 * Thin forwarding wrapper around ICU's u_getNumericValue(): returns the
 * result for code point `c` unchanged.
 */
double __hs_u_getNumericValue(UChar32 c)
{
    const double value = u_getNumericValue(c);
    return value;
}