// Asks the user for a script name via the IDD_NEW dialog and, when the dialog
// is confirmed and the name is not already in use, adds a script with that
// name to the current script set, makes it the current script and appends the
// name to the IDC_LIST_SCRIPT list box.
//
// hwnd: dialog window that hosts the script list box; also used as the owner
//       of the name dialog and of the duplicate-name message box.
void ScriptAdd (HWND hwnd) {
    ScriptManager& manager = ScriptManager::Get();
    ScriptSet* scriptSet = manager.edPtr->tData->scripts;
    if (!scriptSet) {
        return;
    }

    // Only continue when the name dialog was closed with OK.
    int dialogResult = DialogBoxParam(hInstLib, MAKEINTRESOURCE(IDD_NEW), hwnd, NameDialogProc, (LPARAM)"Add Script");
    if (dialogResult != IDOK) {
        return;
    }

    // Refuse a name that an existing script already carries.
    ScriptSet::IScript cursor = scriptSet->scripts.begin();
    while (cursor != scriptSet->scripts.end()) {
        if ((*cursor)->name.compare(manager.tempName) == 0) {
            MessageBox(hwnd, "A script with this name already exists.", "Duplicate Name", MB_ICONWARNING);
            return;
        }
        cursor++;
    }

    // The new script becomes the current one; refresh the selection-dependent
    // UI first, then show the name in the list box.
    scriptSet->curScript = scriptSet->AddScript(manager.tempName);
    ScriptSelChange(hwnd);
    SendDlgItemMessage(hwnd, IDC_LIST_SCRIPT, LB_ADDSTRING, 0, (LPARAM)manager.tempName.c_str());
}
// Deletes the script that is currently selected in the IDC_LIST_SCRIPT list
// box: the matching script is removed from the script set, the current-script
// pointer is cleared, the list box entry is deleted and the selection-dependent
// UI is refreshed via ScriptSelChange().
//
// Does nothing when there is no script set, when nothing is selected, or when
// the text of the selected entry cannot be retrieved.
//
// hwnd: dialog window that hosts the script list box.
void ScriptDelete (HWND hwnd) {
    ScriptManager& sm = ScriptManager::Get();
    ScriptSet* ss = sm.edPtr->tData->scripts;
    if (!ss) {
        return;
    }

    // Is anything selected for deletion?
    int sel = SendDlgItemMessage(hwnd, IDC_LIST_SCRIPT, LB_GETCURSEL, 0, 0);
    if (sel == LB_ERR) {
        return;
    }

    // Find out what's selected. LB_GETTEXTLEN reports LB_ERR on failure; without
    // this check the buffer below would be sized from -1 and never filled in.
    int slen = SendDlgItemMessage(hwnd, IDC_LIST_SCRIPT, LB_GETTEXTLEN, sel, 0);
    if (slen == LB_ERR) {
        return;
    }

    char* buf = new char[slen + 1];
    buf[0] = '\0';
    if (SendDlgItemMessage(hwnd, IDC_LIST_SCRIPT, LB_GETTEXT, sel, (LPARAM)buf) == LB_ERR) {
        // The entry text could not be read; do not compare against garbage.
        delete[] buf;
        return;
    }

    for (ScriptSet::IScript it = ss->scripts.begin(); it != ss->scripts.end(); it++) {
        if ((*it)->name.compare(buf) == 0) {
            ss->RemScript(buf);

            // We just deleted our current script!
            ss->curScript = 0;

            SendDlgItemMessage(hwnd, IDC_LIST_SCRIPT, LB_DELETESTRING, sel, 0);
            ScriptSelChange(hwnd);
            // The container was modified; stop iterating immediately.
            break;
        }
    }

    delete[] buf;
}
// Intersects this set with a temporary set holding only `script`.
//
// script: the script code to intersect with.
// status: in/out ICU error code. If it reports a failure after `script` has
//         been put into the temporary set, this set is left untouched.
// Returns a reference to this set.
ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) {
    ScriptSet single;
    single.set(script, status);
    if (U_FAILURE(status)) {
        return *this;
    }
    this->intersect(single);
    return *this;
}
// Computes the restriction level of a string, according to UTS 39 section 5.2. URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { // Section 5.2 step 1: if (!fAllowedCharsSet->containsAll(input)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 2 // Java use a static UnicodeSet for this test. In C++, avoid the static variable // and just do a simple for loop. UBool allASCII = TRUE; for (int32_t i=0, length=input.length(); i<length; i++) { if (input.charAt(i) > 0x7f) { allASCII = FALSE; break; } } if (allASCII) { return USPOOF_ASCII; } // Section 5.2 steps 3: ScriptSet resolvedScriptSet; getResolvedScriptSet(input, resolvedScriptSet, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 4: if (!resolvedScriptSet.isEmpty()) { return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; } // Section 5.2 step 5: ScriptSet resolvedNoLatn; getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 6: if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) || resolvedNoLatn.test(USCRIPT_JAPANESE, status) || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { return USPOOF_HIGHLY_RESTRICTIVE; } // Section 5.2 step 7: if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) && !resolvedNoLatn.test(USCRIPT_GREEK, status) && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { return USPOOF_MODERATELY_RESTRICTIVE; } // Section 5.2 step 8: return USPOOF_MINIMALLY_RESTRICTIVE; }
// Tests whether `container` covers `containee` and, in addition, shares at
// least one script with every alternate script set stored in fScriptSetSet.
//
// Returns TRUE only when both conditions hold; FALSE as soon as one fails.
UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    if (!container.contains(containee)) {
        return FALSE;
    }

    int32_t cursor = UHASH_FIRST;
    const UHashElement *entry = uhash_nextElement(fScriptSetSet, &cursor);
    while (entry != NULL) {
        // The hash keys are the alternate script sets themselves.
        ScriptSet *alternateSet = static_cast<ScriptSet *>(entry->key.pointer);
        if (!container.intersects(*alternateSet)) {
            return FALSE;
        }
        entry = uhash_nextElement(fScriptSetSet, &cursor);
    }
    return TRUE;
}
// Computes the resolved script set for a string, omitting characters having the specified script. // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { result.setAll(); ScriptSet temp; UChar32 codePoint; for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { codePoint = input.char32At(i); // Compute the augmented script set for the character getAugmentedScriptSet(codePoint, temp, status); if (U_FAILURE(status)) { return; } // Intersect the augmented script set with the resolved script set, but only if the character doesn't // have the script specified in the function call if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { result.intersect(temp); } } }
// Appends a textual listing of the script sets stored as keys of `alternates`
// to `dest`. The sets are sorted with uhash_compareScriptSet and separated by
// "; ".
//
// dest:       string that receives the appended text.
// alternates: hash table whose keys are ScriptSet pointers.
// status:     in/out ICU error code; if the temporary vector cannot be set up,
//             `dest` is returned unchanged.
// Returns `dest`.
UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    UVector ordered(status);
    if (U_FAILURE(status)) {
        return dest;
    }

    // Gather all key sets; the vector only borrows the pointers.
    int32_t cursor = UHASH_FIRST;
    const UHashElement *entry = uhash_nextElement(alternates, &cursor);
    while (entry != NULL) {
        ScriptSet *keySet = static_cast<ScriptSet *>(entry->key.pointer);
        ordered.addElement(keySet, status);
        entry = uhash_nextElement(alternates, &cursor);
    }
    ordered.sort(uhash_compareScriptSet, status);

    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    for (int32_t index = 0; index < ordered.size(); ++index) {
        // The separator goes between entries, never before the first one.
        if (index != 0) {
            dest.append(separator);
        }
        ScriptSet *current = static_cast<ScriptSet *>(ordered.elementAt(index));
        current->displayScripts(dest);
    }
    return dest;
}
// Stores `identifier` in this object and recomputes the derived data:
//   - fNumerics:              the zero digit of every decimal-digit system seen,
//   - fRequiredScripts:       scripts of characters that have exactly one script
//                             (ignoring Common and Inherited),
//   - fScriptSetSet:          the script sets of characters with several scripts
//                             ("alternates") that are not already covered,
//   - fCommonAmongAlternates: the intersection over the alternates.
//
// identifier: the string to analyze.
// status:     in/out ICU error code. Nothing is done if it already reports a
//             failure; the function returns early if looking up the script
//             extensions of a character fails.
// Returns a reference to this object.
IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();
    ScriptSet scriptsForCP;
    UChar32 cp;
    // Walk the identifier one code point at a time.
    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
        cp = identifier.char32At(i);
        // Store a representative character for each kind of decimal digit
        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
        }
        UScriptCode extensions[500];
        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }
        // Rebuild the per-character script set from the script extensions,
        // then drop Common and Inherited, which do not count as scripts here.
        scriptsForCP.resetAll();
        for (int32_t j=0; j<extensionsCount; j++) {
            scriptsForCP.set(extensions[j], status);
        }
        scriptsForCP.reset(USCRIPT_COMMON, status);
        scriptsForCP.reset(USCRIPT_INHERITED, status);
        switch (scriptsForCP.countMembers()) {
          case 0: break;
          case 1:
            // Single script, record it.
            fRequiredScripts->Union(scriptsForCP);
            break;
          default:
            // Several candidate scripts: only remember the set if none of its
            // scripts is already required and an equal set is not stored yet.
            if (!fRequiredScripts->intersects(scriptsForCP)
                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
                // If the set hasn't been added already, add it
                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
                // NOTE(review): the result of `new` is passed on unchecked.
                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
            }
            break;
        }
    }
    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();
        for (int32_t it = UHASH_FIRST;;) {
            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
            if (nextHashEl == NULL) {
                break;
            }
            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*next)) {
                // Elements are removed from the table while it is being
                // iterated; the statement order here must be preserved.
                uhash_removeElement(fScriptSetSet, nextHashEl);
            } else {
                // `next` contributes to the common set before the superset
                // check below, i.e. even if it ends up being removed.
                fCommonAmongAlternates->intersect(*next);
                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                // Inner scan with its own cursor: drop `next` when another
                // stored set is contained in it.
                for (int32_t otherIt = UHASH_FIRST;;) {
                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
                    if (otherHashEl == NULL) {
                        break;
                    }
                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
                    if (next != other && next->contains(*other)) {
                        uhash_removeElement(fScriptSetSet, nextHashEl);
                        break;
                    }
                }
            }
        }
    }
    // With no alternates left (or none to begin with) there is nothing in common.
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
}
/**
 * Runs the checks enabled in the spoof checker's fChecks mask on a UTF-16 string.
 *
 * @param sc        The spoof checker; validated via SpoofImpl::validateThis().
 * @param text      The UTF-16 text to check.
 * @param length    Number of code units in text, or -1 if text is NUL-terminated.
 *                  Values below -1 set U_ILLEGAL_ARGUMENT_ERROR.
 * @param position  Optional out parameter. When non-NULL and at least one check
 *                  recorded a failure position, receives the smallest position
 *                  recorded; otherwise it is left untouched.
 * @param status    In/out ICU error code.
 * @return          Bitwise OR of the USPOOF_* flags of the checks that failed;
 *                  0 if the checker is invalid or length is illegal.
 */
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        //   Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t result = 0;
    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCRIPT_CONFUSABLE tests.
    // Share the computation when possible.  scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
        scriptCount = This->scriptScan(text, length, failPos, *status);
        // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
        if ( scriptCount >= 2) {
            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
            result |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        int32_t i;
        UChar32 c;
        for (i=0; i<length ;) {
            U16_NEXT(text, i, length, c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                // Keep the earliest failure position seen by any check.
                if (i < failPos) {
                    failPos = i;
                }
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        NFDBuffer   normalizedInput(text, length, *status);
        const UChar  *nfdText = normalizedInput.getBuffer();
        int32_t      nfdLength = normalizedInput.getLength();

        if (This->fChecks & USPOOF_INVISIBLE) {

            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.

            for (i=0; i<nfdLength ;) {
                U16_NEXT(nfdText, i, nfdLength, c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // A non-mark ends the current combining sequence.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // First mark of a sequence: remember it, but defer touching
                    // the set until a second mark actually shows up.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    // TODO: Bug 8655: the position is an index into the NFD buffer, but what we want
                    //       to give back to our caller is a position in the original input string.
                    //       Until then, clamp it to the length of the original input.
                    int32_t markPos = (i > length) ? length : i;
                    // Like the CHAR_LIMIT check above, only ever lower failPos so that an
                    // earlier failure recorded by another check is not overwritten.
                    if (markPos < failPos) {
                        failPos = markPos;
                    }
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }

        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            //    confusable with itself in its own script.
            // If the number of such scripts is two or more, and the input consisted of
            //   characters all from a single script, we have a whole script confusable.
            //   (The two scripts will be the original script and the one that is confusable)
            // If the number of such scripts >= one, and the original input contained characters from
            //   more than one script, we have a mixed script confusable.  (We can transform
            //   some of the characters, and end up with a visually similar string all in
            //   one script.)

            if (scriptCount == -1) {
                // The script count was not computed by the SINGLE_SCRIPT check;
                // the position reported by this scan is not wanted.
                int32_t t = 0;
                scriptCount = This->scriptScan(text, length, t, *status);
            }

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

    // Only report a position if some check actually recorded one.
    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return result;
}
// Computes the augmented script set for a code point, according to UTS 39 section 5.1. void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { result.resetAll(); result.setScriptExtensions(codePoint, status); if (U_FAILURE(status)) { return; } // Section 5.1 step 1 if (result.test(USCRIPT_HAN, status)) { result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); result.set(USCRIPT_JAPANESE, status); result.set(USCRIPT_KOREAN, status); } if (result.test(USCRIPT_HIRAGANA, status)) { result.set(USCRIPT_JAPANESE, status); } if (result.test(USCRIPT_KATAKANA, status)) { result.set(USCRIPT_JAPANESE, status); } if (result.test(USCRIPT_HANGUL, status)) { result.set(USCRIPT_KOREAN, status); } if (result.test(USCRIPT_BOPOMOFO, status)) { result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); } // Section 5.1 step 2 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { result.setAll(); } }
/**
 * Runs the checks enabled in the spoof checker's fChecks mask on a UnicodeString.
 *
 * @param sc        The spoof checker; validated via SpoofImpl::validateThis().
 * @param id        The identifier to check.
 * @param position  Optional out parameter. When non-NULL it is set to 0 on every
 *                  path that reaches the cleanup label; it is not touched when
 *                  the checker fails validation.
 * @param status    In/out ICU error code.
 * @return          Bitwise OR of the USPOOF_* flags of the checks that failed.
 *                  When USPOOF_AUX_INFO is enabled together with
 *                  USPOOF_RESTRICTION_LEVEL, the identifier's restriction level
 *                  value is OR-ed in as well. Returns 0 if the checker is invalid.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const icu::UnicodeString &id,
                          int32_t *position,
                          UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    int32_t result = 0;

    // Obtained lazily from the checker; handed back at cleanupAndReturn on
    // every path, including when it is still NULL.
    IdentifierInfo *identifierInfo = NULL;
    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
        identifierInfo = This->getIdentifierInfo(*status);
        if (U_FAILURE(*status)) {
            goto cleanupAndReturn;
        }
        identifierInfo->setIdentifier(id, *status);
        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    }

    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
        // Fails when the identifier is less restricted than the checker allows.
        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
        if (idRestrictionLevel > This->fRestrictionLevel) {
            result |= USPOOF_RESTRICTION_LEVEL;
        }
        if (This->fChecks & USPOOF_AUX_INFO) {
            result |= idRestrictionLevel;
        }
    }

    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
        // More than one entry means digits from more than one numbering system.
        const UnicodeSet *numerics = identifierInfo->getNumerics();
        if (numerics->size() > 1) {
            result |= USPOOF_MIXED_NUMBERS;
        }

        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
        //       We have no easy way to do the same in C.
        // if (checkResult != null) {
        //     checkResult.numerics = numerics;
        // }
    }

    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
        // Fails on the first code point outside the allowed character set.
        int32_t i;
        UChar32 c;
        int32_t length = id.length();
        for (i=0; i<length ;) {
            c = id.char32At(i);
            i += U16_LENGTH(c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        // NOTE(review): *status is not inspected after normalize(); confirm
        // that the scans below are safe when normalization fails.
        UnicodeString nfdText;
        gNfdNormalizer->normalize(id, nfdText, *status);
        int32_t nfdLength = nfdText.length();

        if (This->fChecks & USPOOF_INVISIBLE) {

            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.

            for (i=0; i<nfdLength ;) {
                c = nfdText.char32At(i);
                i += U16_LENGTH(c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // A non-mark ends the current combining sequence.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // First mark of a sequence: remember it, but do not touch
                    // the set until a second mark shows up.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }


        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            // confusable with itself in its own script.
            //
            // If the number of such scripts is two or more, and the input consisted of
            // characters all from a single script, we have a whole script confusable.
            // (The two scripts will be the original script and the one that is confusable)
            //
            // If the number of such scripts >= one, and the original input contained characters from
            // more than one script, we have a mixed script confusable.  (We can transform
            // some of the characters, and end up with a visually similar string all in
            // one script.)

            // The identifier info may not have been set up yet if neither the
            // restriction-level nor the mixed-numbers check is enabled.
            if (identifierInfo == NULL) {
                identifierInfo = This->getIdentifierInfo(*status);
                if (U_FAILURE(*status)) {
                    goto cleanupAndReturn;
                }
                identifierInfo->setIdentifier(id, *status);
            }

            int32_t scriptCount = identifierInfo->getScriptCount();

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

cleanupAndReturn:
    // Hand the identifier info back to the checker (called even when it is NULL).
    This->releaseIdentifierInfo(identifierInfo);
    // This function does not compute a failure position; a non-NULL position is
    // always reset to 0.
    if (position != NULL) {
        *position = 0;
    }
    return result;
}