// Computes the augmented script set for a code point, according to UTS 39 section 5.1. void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { result.resetAll(); result.setScriptExtensions(codePoint, status); if (U_FAILURE(status)) { return; } // Section 5.1 step 1 if (result.test(USCRIPT_HAN, status)) { result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); result.set(USCRIPT_JAPANESE, status); result.set(USCRIPT_KOREAN, status); } if (result.test(USCRIPT_HIRAGANA, status)) { result.set(USCRIPT_JAPANESE, status); } if (result.test(USCRIPT_KATAKANA, status)) { result.set(USCRIPT_JAPANESE, status); } if (result.test(USCRIPT_HANGUL, status)) { result.set(USCRIPT_KOREAN, status); } if (result.test(USCRIPT_BOPOMOFO, status)) { result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); } // Section 5.1 step 2 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { result.setAll(); } }
// Computes the restriction level of a string, according to UTS 39 section 5.2. URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { // Section 5.2 step 1: if (!fAllowedCharsSet->containsAll(input)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 2 // Java use a static UnicodeSet for this test. In C++, avoid the static variable // and just do a simple for loop. UBool allASCII = TRUE; for (int32_t i=0, length=input.length(); i<length; i++) { if (input.charAt(i) > 0x7f) { allASCII = FALSE; break; } } if (allASCII) { return USPOOF_ASCII; } // Section 5.2 steps 3: ScriptSet resolvedScriptSet; getResolvedScriptSet(input, resolvedScriptSet, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 4: if (!resolvedScriptSet.isEmpty()) { return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; } // Section 5.2 step 5: ScriptSet resolvedNoLatn; getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 6: if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) || resolvedNoLatn.test(USCRIPT_JAPANESE, status) || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { return USPOOF_HIGHLY_RESTRICTIVE; } // Section 5.2 step 7: if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) && !resolvedNoLatn.test(USCRIPT_GREEK, status) && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { return USPOOF_MODERATELY_RESTRICTIVE; } // Section 5.2 step 8: return USPOOF_MINIMALLY_RESTRICTIVE; }
// Computes the resolved script set for a string, omitting characters having the specified script. // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { result.setAll(); ScriptSet temp; UChar32 codePoint; for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { codePoint = input.char32At(i); // Compute the augmented script set for the character getAugmentedScriptSet(codePoint, temp, status); if (U_FAILURE(status)) { return; } // Intersect the augmented script set with the resolved script set, but only if the character doesn't // have the script specified in the function call if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { result.intersect(temp); } } }