// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp // except we use icu in place of CoreFoundations character classes. static USet* getSmartSet(bool isPreviousCharacter) { static USet* preSmartSet = nullptr; static USet* postSmartSet = nullptr; USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet; if (!smartSet) { // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline) UErrorCode ec = U_ZERO_ERROR; String whitespaceAndNewline("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]"); smartSet = uset_openPattern(whitespaceAndNewline.charactersWithNullTermination().data(), whitespaceAndNewline.length(), &ec); ASSERT(U_SUCCESS(ec)); // CJK ranges uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF) uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF) uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF) uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF) uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF) uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F) uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F) uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF) uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D) if (isPreviousCharacter) { addAllCodePoints(smartSet, "([\"\'#$/-`{"); preSmartSet = smartSet; } else { addAllCodePoints(smartSet, ")].,;:?\'!\"%*-/}"); // Punctuation (kCFCharacterSetPunctuation) UErrorCode ec = U_ZERO_ERROR; String punctuationClass("[:P:]"); USet* icuPunct = 
uset_openPattern(punctuationClass.charactersWithNullTermination().data(), punctuationClass.length(), &ec); ASSERT(U_SUCCESS(ec)); uset_addAll(smartSet, icuPunct); uset_close(icuPunct); postSmartSet = smartSet; } } return smartSet; }
/*
 * Exercises uset_freeze(), uset_isFrozen() and uset_cloneAsThawed():
 *  - a frozen clone must ignore modification attempts and stay equal to the
 *    set it was cloned from;
 *  - a thawed clone of the frozen set must be modifiable again.
 * Every set opened here is closed on every return path.
 */
static void TestFreezable() {
    USet *idSet;
    USet *frozen;
    USet *thawed;

    idSet=openIDSet();

    if (idSet == NULL) {
        log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
        uset_close(idSet);
        return;
    }

    frozen=uset_clone(idSet);

    if (frozen == NULL) {
        log_err("uset_Clone() returned NULL\n");
        /* idSet was opened successfully above; do not leak it. */
        uset_close(idSet);
        return;
    }

    if(!uset_equals(frozen, idSet)) {
        log_err("uset_clone() did not make an equal copy\n");
    }

    uset_freeze(frozen);
    /* This addRange targets a frozen set, so it must have no effect. */
    uset_addRange(frozen, 0xd802, 0xd805);

    if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
        log_err("uset_freeze() or uset_isFrozen() does not work\n");
    }

    thawed=uset_cloneAsThawed(frozen);

    if (thawed == NULL) {
        log_err("uset_cloneAsThawed(frozen) returned NULL");
        uset_close(frozen);
        uset_close(idSet);
        return;
    }

    /* The thawed clone is modifiable, so now the range must show up. */
    uset_addRange(thawed, 0xd802, 0xd805);

    if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
        log_err("uset_cloneAsThawed() does not work\n");
    }

    uset_close(idSet);
    uset_close(frozen);
    uset_close(thawed);
}
static void generateSelectorData(UConverterSelector* result, UPropsVectors *upvec, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet, UErrorCode* status) { if (U_FAILURE(*status)) { return; } int32_t columns = (result->encodingsCount+31)/32; // set errorValue to all-ones for (int32_t col = 0; col < columns; col++) { upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status); } for (int32_t i = 0; i < result->encodingsCount; ++i) { uint32_t mask; uint32_t column; int32_t item_count; int32_t j; UConverter* test_converter = ucnv_open(result->encodings[i], status); if (U_FAILURE(*status)) { return; } USet* unicode_point_set; unicode_point_set = uset_open(1, 0); // empty set ucnv_getUnicodeSet(test_converter, unicode_point_set, whichSet, status); if (U_FAILURE(*status)) { ucnv_close(test_converter); return; } column = i / 32; mask = 1 << (i%32); // now iterate over intervals on set i! item_count = uset_getItemCount(unicode_point_set); for (j = 0; j < item_count; ++j) { UChar32 start_char; UChar32 end_char; UErrorCode smallStatus = U_ZERO_ERROR; uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, &smallStatus); if (U_FAILURE(smallStatus)) { // this will be reached for the converters that fill the set with // strings. Those should be ignored by our system } else { upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask, status); } } ucnv_close(test_converter); uset_close(unicode_point_set); if (U_FAILURE(*status)) { return; } } // handle excluded encodings! 
Simply set their values to all 1's in the upvec if (excludedCodePoints) { int32_t item_count = uset_getItemCount(excludedCodePoints); for (int32_t j = 0; j < item_count; ++j) { UChar32 start_char; UChar32 end_char; uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, status); for (int32_t col = 0; col < columns; col++) { upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status); } } } // alright. Now, let's put things in the same exact form you'd get when you // unserialize things. result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); result->pvCount *= columns; // number of uint32_t = rows * columns result->ownPv = TRUE; }
void SSearchTest::monkeyTest(char *params) { // ook! UErrorCode status = U_ZERO_ERROR; //UCollator *coll = ucol_open(NULL, &status); UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); if (U_FAILURE(status)) { errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); return; } CollData *monkeyData = new CollData(coll, status); USet *expansions = uset_openEmpty(); USet *contractions = uset_openEmpty(); ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); USet *letters = uset_openPattern(letter_pattern, 39, &status); SetMonkey letterMonkey(letters); StringSetMonkey contractionMonkey(contractions, coll, monkeyData); StringSetMonkey expansionMonkey(expansions, coll, monkeyData); UnicodeString testCase; UnicodeString alternate; UnicodeString pattern, altPattern; UnicodeString prefix, altPrefix; UnicodeString suffix, altSuffix; Monkey *monkeys[] = { &letterMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey}; int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); // int32_t nonMatchCount = 0; UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; const char *strengthNames[] = {"primary", "secondary", "tertiary"}; int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); int32_t loopCount = quick? 
1000 : 10000; int32_t firstStrength = 0; int32_t lastStrength = strengthCount - 1; //*/ 0; if (params != NULL) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS UnicodeString p(params); loopCount = getIntParam("loop", p, loopCount); m_seed = getIntParam("seed", p, m_seed); RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); if (m.find()) { UnicodeString breakType = m.group(1, status); for (int32_t s = 0; s < strengthCount; s += 1) { if (breakType == strengthNames[s]) { firstStrength = lastStrength = s; break; } } m.reset(); p = m.replaceFirst("", status); } if (RegexMatcher("\\S", p, 0, status).find()) { // Each option is stripped out of the option string as it is processed. // All options have been checked. The option string should have been completely emptied.. char buf[100]; p.extract(buf, sizeof(buf), NULL, status); buf[sizeof(buf)-1] = 0; errln("Unrecognized or extra parameter: %s\n", buf); return; } #else infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); #endif } for(int32_t s = firstStrength; s <= lastStrength; s += 1) { int32_t notFoundCount = 0; logln("Setting strength to %s.", strengthNames[s]); ucol_setStrength(coll, strengths[s]); // TODO: try alternate prefix and suffix too? // TODO: alterntaes are only equal at primary strength. Is this OK? 
for(int32_t t = 0; t < loopCount; t += 1) { uint32_t seed = m_seed; // int32_t nmc = 0; generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); // pattern notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); testCase.remove(); testCase.append(prefix); testCase.append(/*alt*/pattern); // prefix + pattern notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); testCase.append(suffix); // prefix + pattern + suffix notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); testCase.remove(); testCase.append(pattern); testCase.append(suffix); // pattern + suffix notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); } logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); } uset_close(contractions); uset_close(expansions); uset_close(letters); delete monkeyData; ucol_close(coll); }
// Builds the lookup data for a collator.
//
// Step 1: for every assigned, non-"other" character plus the collator's
// contractions and expansions (minus the implicitly handled Han/Jamo/Hangul
// ranges), compute its CE list and record the string in
// ceToCharsStartingWith under the list's first CE.
// Step 2: derive jamoLimits and the minHan/maxHan bounds from the CEs of the
// Jamo range endpoints and of the [:Unified_Ideograph:] range endpoints.
//
// @param collator collator to analyze; it is used directly unless
//                 CLONE_COLLATOR is defined, in which case a clone is made
// @param status   ICU in/out error code; on failure the object is left only
//                 partly initialized
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characters we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        // One of the two sets may still have been opened successfully;
        // uset_close() accepts NULL, so closing both is always safe.
        uset_close(charsToRemove);
        uset_close(charsToTest);
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        // len == 0: the item is a code point range [start, end];
        // len  > 0: the item is a string copied into buffer.
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    // Collect the first and last code point of every range.
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
    UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
    CEList hanList(coll, hanString, status);
    CEList jamoList(coll, jamoString, status);
    int32_t j = 0;

    if (U_FAILURE(status)) {
        return;
    }

    // Keep only the non-continuation CE of each Jamo boundary character.
    for (int32_t c = 0; c < jamoList.size(); c += 1) {
        uint32_t jce = jamoList[c];

        if (! isContinuation(jce)) {
            jamoLimits[j++] = jce;
        }
    }

    jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

    minHan = 0xFFFFFFFF;
    maxHan = 0;

    // Steps by two through the CE list built from the range endpoints.
    for(int32_t h = 0; h < hanList.size(); h += 2) {
        uint32_t han = (uint32_t) hanList[h];

        if (han < minHan) {
            minHan = han;
        }

        if (han > maxHan) {
            maxHan = han;
        }
    }

    maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
static void TestUSpoofCAPI(void) { /* * basic uspoof_open(). */ { USpoofChecker *sc; UErrorCode status = U_ZERO_ERROR; sc = uspoof_open(&status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { /* If things are so broken that we can't even open a default spoof checker, */ /* don't even try the rest of the tests. They would all fail. */ return; } uspoof_close(sc); } /* * Test Open from source rules. */ TEST_SETUP const char *dataSrcDir; char *fileName; char *confusables; int confusablesLength; char *confusablesWholeScript; int confusablesWholeScriptLength; FILE *f; UParseError pe; int32_t errType; USpoofChecker *rsc; dataSrcDir = ctest_dataSrcDir(); fileName = malloc(strlen(dataSrcDir) + 100); strcpy(fileName, dataSrcDir); strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt"); f = fopen(fileName, "r"); TEST_ASSERT_NE(f, NULL); confusables = malloc(3000000); confusablesLength = fread(confusables, 1, 3000000, f); fclose(f); strcpy(fileName, dataSrcDir); strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt"); f = fopen(fileName, "r"); TEST_ASSERT_NE(f, NULL); confusablesWholeScript = malloc(1000000); confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f); fclose(f); rsc = uspoof_openFromSource(confusables, confusablesLength, confusablesWholeScript, confusablesWholeScriptLength, &errType, &pe, &status); TEST_ASSERT_SUCCESS(status); free(confusablesWholeScript); free(confusables); free(fileName); uspoof_close(rsc); /* printf("ParseError Line is %d\n", pe.line); */ TEST_TEARDOWN; /* * openFromSerialized and serialize */ TEST_SETUP int32_t serializedSize = 0; int32_t actualLength = 0; char *buf; USpoofChecker *sc2; int32_t checkResults; serializedSize = uspoof_serialize(sc, NULL, 0, &status); TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(serializedSize > 0); /* Serialize the default spoof checker */ status = U_ZERO_ERROR; buf = (char *)malloc(serializedSize + 10); 
TEST_ASSERT(buf != NULL); buf[serializedSize] = 42; uspoof_serialize(sc, buf, serializedSize, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(42, buf[serializedSize]); /* Create a new spoof checker from the freshly serialized data */ sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(NULL, sc2); TEST_ASSERT_EQ(serializedSize, actualLength); /* Verify that the new spoof checker at least wiggles */ checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); uspoof_close(sc2); free(buf); TEST_TEARDOWN; /* * Set & Get Check Flags */ TEST_SETUP int32_t t; uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS); uspoof_setChecks(sc, 0, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(0, t); uspoof_setChecks(sc, USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t); TEST_TEARDOWN; /* * get & setAllowedChars */ TEST_SETUP USet *us; const USet *uset; uset = uspoof_getAllowedChars(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(uset_isFrozen(uset)); us = uset_open((UChar32)0x41, (UChar32)0x5A); /* [A-Z] */ uspoof_setAllowedChars(sc, us, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status)); TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status))); TEST_ASSERT_SUCCESS(status); uset_close(us); TEST_TEARDOWN; /* * clone() */ TEST_SETUP 
USpoofChecker *clone1 = NULL; USpoofChecker *clone2 = NULL; int32_t checkResults = 0; clone1 = uspoof_clone(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(clone1, sc); clone2 = uspoof_clone(clone1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(clone2, clone1); uspoof_close(clone1); /* Verify that the cloned spoof checker is alive */ checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); uspoof_close(clone2); TEST_TEARDOWN; /* * get & set Checks */ TEST_SETUP int32_t checks; int32_t checks2; int32_t checkResults; checks = uspoof_getChecks(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks); checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE); uspoof_setChecks(sc, checks, &status); TEST_ASSERT_SUCCESS(status); checks2 = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(checks, checks2); /* The checks that were disabled just above are the same ones that the "scMixed" test fails. 
So with those tests gone checking that Identifier should now succeed */ checkResults = uspoof_check(sc, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_TEARDOWN; /* * AllowedLoacles */ TEST_SETUP const char *allowedLocales; int32_t checkResults; /* Default allowed locales list should be empty */ allowedLocales = uspoof_getAllowedLocales(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(strcmp("", allowedLocales) == 0) /* Allow en and ru, which should enable Latin and Cyrillic only to pass */ uspoof_setAllowedLocales(sc, "en, ru_RU", &status); TEST_ASSERT_SUCCESS(status); allowedLocales = uspoof_getAllowedLocales(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(strstr(allowedLocales, "en") != NULL); TEST_ASSERT(strstr(allowedLocales, "ru") != NULL); /* Limit checks to USPOOF_CHAR_LIMIT. Some of the test data has whole script confusables also, * which we don't want to see in this test. */ uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults); checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); /* Reset with an empty locale list, which should allow all characters to pass */ uspoof_setAllowedLocales(sc, " ", &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_TEARDOWN; /* * AllowedChars set/get the USet of allowed characters. */ TEST_SETUP const USet *set; USet *tmpSet; int32_t checkResults; /* By default, we should see no restriction; the USet should allow all characters. 
*/ set = uspoof_getAllowedChars(sc, &status); TEST_ASSERT_SUCCESS(status); tmpSet = uset_open(0, 0x10ffff); TEST_ASSERT(uset_equals(tmpSet, set)); /* Setting the allowed chars should enable the check. */ uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status); TEST_ASSERT_SUCCESS(status); /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ uset_remove(tmpSet, goodLatin[1]); uspoof_setAllowedChars(sc, tmpSet, &status); TEST_ASSERT_SUCCESS(status); uset_close(tmpSet); /* Latin Identifier should now fail; other non-latin test cases should still be OK */ checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * check UTF-8 */ TEST_SETUP char utf8buf[200]; int32_t checkResults; int32_t position; u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status); TEST_ASSERT_SUCCESS(status); position = 666; checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_ASSERT_EQ(666, position); u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status); TEST_ASSERT_SUCCESS(status); position = 666; checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults); TEST_ASSERT_EQ(2, position); TEST_TEARDOWN; /* * uspoof_areConfusable() */ TEST_SETUP int32_t checkResults; checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, 
&status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * areConfusableUTF8 */ TEST_SETUP int32_t checkResults; char s1[200]; char s2[200]; u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status); u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status); u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status); u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * getSkeleton */ TEST_SETUP UChar dest[100]; int32_t skelLength; skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, sizeof(dest)/sizeof(UChar), &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest)); TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength); skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, dest, sizeof(dest)/sizeof(UChar), &status); TEST_ASSERT_SUCCESS(status); skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status); 
TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status); TEST_ASSERT_EQ(3, skelLength); status = U_ZERO_ERROR; TEST_TEARDOWN; }
static void TestSelector() { TestText text; USet* excluded_sets[3] = { NULL }; int32_t i, testCaseIdx; if (!getAvailableNames()) { return; } if (!text_open(&text)) { releaseAvailableNames();; } excluded_sets[0] = uset_openEmpty(); for(i = 1 ; i < 3 ; i++) { excluded_sets[i] = uset_open(i*30, i*30+500); } for(testCaseIdx = 0; testCaseIdx < UPRV_LENGTHOF(getEncodingsFns); testCaseIdx++) { int32_t excluded_set_id; int32_t num_encodings; const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings); if (getTestOption(QUICK_OPTION) && num_encodings > 25) { uprv_free((void *)encodings); continue; } /* * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++) * * This loop was replaced by the following statement because * the loop made the test run longer without adding to the code coverage. * The handling of the exclusion set is independent of the * set of encodings, so there is no need to test every combination. */ excluded_set_id = testCaseIdx % UPRV_LENGTHOF(excluded_sets); { UConverterSelector *sel_rt, *sel_fb; char *buffer_fb = NULL; UErrorCode status = U_ZERO_ERROR; sel_rt = ucnvsel_open(encodings, num_encodings, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET, &status); if (num_encodings == gCountAvailable) { /* test the special "all converters" parameter values */ sel_fb = ucnvsel_open(NULL, 0, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status); } else if (uset_isEmpty(excluded_sets[excluded_set_id])) { /* test that a NULL set gives the same results as an empty set */ sel_fb = ucnvsel_open(encodings, num_encodings, NULL, UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status); } else { sel_fb = ucnvsel_open(encodings, num_encodings, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status); } if (U_FAILURE(status)) { log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status)); ucnvsel_close(sel_rt); uprv_free((void *)encodings); continue; } text_reset(&text); for (;;) { UBool *manual_rt, 
*manual_fb; static UChar utf16[10000]; char *s; int32_t length8, length16; s = text_nextString(&text, &length8); if (s == NULL || (getTestOption(QUICK_OPTION) && text.number > 3)) { break; } manual_rt = getResultsManually(encodings, num_encodings, s, length8, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_SET); manual_fb = getResultsManually(encodings, num_encodings, s, length8, excluded_sets[excluded_set_id], UCNV_ROUNDTRIP_AND_FALLBACK_SET); /* UTF-8 with length */ status = U_ZERO_ERROR; verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt); verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb); /* UTF-8 NUL-terminated */ verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt); verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb); u_strFromUTF8(utf16, UPRV_LENGTHOF(utf16), &length16, s, length8, &status); if (U_FAILURE(status)) { log_err("error converting the test text (string %ld) to UTF-16 - %s\n", (long)text.number, u_errorName(status)); } else { if (text.number == 0) { sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status); } if (U_SUCCESS(status)) { /* UTF-16 with length */ verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt); verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb); /* UTF-16 NUL-terminated */ verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt); verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb); } } uprv_free(manual_rt); uprv_free(manual_fb); } ucnvsel_close(sel_rt); ucnvsel_close(sel_fb); uprv_free(buffer_fb); } uprv_free((void *)encodings); } releaseAvailableNames(); text_close(&text); for(i = 0 ; i < 3 ; i++) { uset_close(excluded_sets[i]); } }
/** * Basic API test for uset.x */ static void TestAPI() { USet* set; USet* set2; UErrorCode ec; /* [] */ set = uset_openEmpty(); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 0); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 1); uset_clear(set); expect(set, "", "abc{ab}", NULL); uset_close(set); /* [ABC] */ set = uset_open(0x0041, 0x0043); expect(set, "ABC", "DEF{ab}", NULL); uset_close(set); /* [a-c{ab}] */ ec = U_ZERO_ERROR; set = uset_openPattern(PAT, PAT_LEN, &ec); if(U_FAILURE(ec)) { log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); return; } if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { log_err("uset_resemblesPattern of PAT failed\n"); } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ uset_add(set, 0x64); expect(set, "abcd{ab}", "ef{bc}", NULL); /* [acd{ab}{bc}] */ uset_remove(set, 0x62); uset_addString(set, STR_bc, STR_bc_LEN); expect(set, "acd{ab}{bc}", "bef{cd}", NULL); /* [acd{bc}] */ uset_removeString(set, STR_ab, STR_ab_LEN); expect(set, "acd{bc}", "bfg{ab}", NULL); /* [^acd{bc}] */ uset_complement(set); expect(set, "bef{bc}", "acd{ac}", NULL); /* [a-e{bc}] */ uset_complement(set); uset_addRange(set, 0x0062, 0x0065); expect(set, "abcde{bc}", "fg{ab}", NULL); /* [de{bc}] */ uset_removeRange(set, 0x0050, 0x0063); expect(set, "de{bc}", "bcfg{ab}", NULL); /* [g-l] */ uset_set(set, 0x0067, 0x006C); expect(set, "ghijkl", "de{bc}", NULL); if (uset_indexOf(set, 0x0067) != 0) { log_err("uset_indexOf failed finding correct index of 'g'\n"); } if (uset_charAt(set, 0) != 0x0067) { log_err("uset_charAt failed finding correct char 'g' at index 0\n"); } /* How to test this one...? 
*/ uset_compact(set); /* [g-i] */ uset_retain(set, 0x0067, 0x0069); expect(set, "ghi", "dejkl{bc}", NULL); /* UCHAR_ASCII_HEX_DIGIT */ uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); if(U_FAILURE(ec)) { log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); return; } expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); /* [ab] */ uset_clear(set); uset_addAllCodePoints(set, STR_ab, STR_ab_LEN); expect(set, "ab", "def{ab}", NULL); if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){ log_err("set should not conatin all characters of \"bc\" \n"); } /* [] */ set2 = uset_open(1, 1); uset_clear(set2); /* space */ uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); expect(set2, " ", "abcdefghi{bc}", NULL); /* [a-c] */ uset_set(set2, 0x0061, 0x0063); /* [g-i] */ uset_set(set, 0x0067, 0x0069); /* [a-c g-i] */ if (uset_containsSome(set, set2)) { log_err("set should not contain some of set2 yet\n"); } uset_complementAll(set, set2); if (!uset_containsSome(set, set2)) { log_err("set should contain some of set2\n"); } expect(set, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_removeAll(set, set2); expect(set, "ghi", "abcdef{bc}", NULL); /* [a-c g-i] */ uset_addAll(set2, set); expect(set2, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_retainAll(set2, set); expect(set2, "ghi", "abcdef{bc}", NULL); uset_close(set); uset_close(set2); }
/* * Spoof Detection C API Tests */ static void TestUSpoofCAPI(void) { /* * basic uspoof_open(). */ { USpoofChecker *sc; UErrorCode status = U_ZERO_ERROR; sc = uspoof_open(&status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { /* If things are so broken that we can't even open a default spoof checker, */ /* don't even try the rest of the tests. They would all fail. */ return; } uspoof_close(sc); } /* * openFromSerialized and serialize */ TEST_SETUP int32_t serializedSize = 0; int32_t actualLength = 0; char *buf; USpoofChecker *sc2; int32_t checkResults; serializedSize = uspoof_serialize(sc, NULL, 0, &status); TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(serializedSize > 0); /* Serialize the default spoof checker */ status = U_ZERO_ERROR; buf = (char *)malloc(serializedSize + 10); TEST_ASSERT(buf != NULL); buf[serializedSize] = 42; uspoof_serialize(sc, buf, serializedSize, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(42, buf[serializedSize]); /* Create a new spoof checker from the freshly serialized data */ sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(NULL, sc2); TEST_ASSERT_EQ(serializedSize, actualLength); /* Verify that the new spoof checker at least wiggles */ checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); uspoof_close(sc2); free(buf); TEST_TEARDOWN; /* * Set & Get Check Flags */ TEST_SETUP int32_t t; uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS); uspoof_setChecks(sc, 0, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(0, t); uspoof_setChecks(sc, 
USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, &status); TEST_ASSERT_SUCCESS(status); t = uspoof_getChecks(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t); TEST_TEARDOWN; /* * get & setAllowedChars */ TEST_SETUP USet *us; const USet *uset; uset = uspoof_getAllowedChars(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(uset_isFrozen(uset)); us = uset_open((UChar32)0x41, (UChar32)0x5A); /* [A-Z] */ uspoof_setAllowedChars(sc, us, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status)); TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status))); TEST_ASSERT_SUCCESS(status); uset_close(us); TEST_TEARDOWN; /* * clone() */ TEST_SETUP USpoofChecker *clone1 = NULL; USpoofChecker *clone2 = NULL; int32_t checkResults = 0; clone1 = uspoof_clone(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(clone1, sc); clone2 = uspoof_clone(clone1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_NE(clone2, clone1); uspoof_close(clone1); /* Verify that the cloned spoof checker is alive */ checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); uspoof_close(clone2); TEST_TEARDOWN; /* * basic uspoof_check() */ TEST_SETUP int32_t result; result = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, result); result = uspoof_check(sc, han_Hiragana, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, result); result = uspoof_check(sc, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result); TEST_TEARDOWN /* * get & set 
Checks */ TEST_SETUP int32_t checks; int32_t checks2; int32_t checkResults; checks = uspoof_getChecks(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks); checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE); uspoof_setChecks(sc, checks, &status); TEST_ASSERT_SUCCESS(status); checks2 = uspoof_getChecks(sc, &status); TEST_ASSERT_EQ(checks, checks2); /* The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests gone checking that Identifier should now succeed */ checkResults = uspoof_check(sc, scMixed, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_TEARDOWN; /* * AllowedLoacles */ TEST_SETUP const char *allowedLocales; int32_t checkResults; /* Default allowed locales list should be empty */ allowedLocales = uspoof_getAllowedLocales(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(strcmp("", allowedLocales) == 0) /* Allow en and ru, which should enable Latin and Cyrillic only to pass */ uspoof_setAllowedLocales(sc, "en, ru_RU", &status); TEST_ASSERT_SUCCESS(status); allowedLocales = uspoof_getAllowedLocales(sc, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(strstr(allowedLocales, "en") != NULL); TEST_ASSERT(strstr(allowedLocales, "ru") != NULL); /* Limit checks to USPOOF_CHAR_LIMIT. Some of the test data has whole script confusables also, * which we don't want to see in this test. 
*/ uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults); checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); /* Reset with an empty locale list, which should allow all characters to pass */ uspoof_setAllowedLocales(sc, " ", &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_TEARDOWN; /* * AllowedChars set/get the USet of allowed characters. */ TEST_SETUP const USet *set; USet *tmpSet; int32_t checkResults; /* By default, we should see no restriction; the USet should allow all characters. */ set = uspoof_getAllowedChars(sc, &status); TEST_ASSERT_SUCCESS(status); tmpSet = uset_open(0, 0x10ffff); TEST_ASSERT(uset_equals(tmpSet, set)); /* Setting the allowed chars should enable the check. */ uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status); TEST_ASSERT_SUCCESS(status); /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ uset_remove(tmpSet, goodLatin[1]); uspoof_setAllowedChars(sc, tmpSet, &status); TEST_ASSERT_SUCCESS(status); uset_close(tmpSet); /* Latin Identifier should now fail; other non-latin test cases should still be OK * Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE * which will give us a USPOOF_RESTRICTION_LEVEL failure. 
*/ checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults); checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * check UTF-8 */ TEST_SETUP char utf8buf[200]; int32_t checkResults; int32_t position; u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status); TEST_ASSERT_SUCCESS(status); position = 666; checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_ASSERT_EQ(0, position); u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status); TEST_ASSERT_SUCCESS(status); position = 666; checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults); TEST_ASSERT_EQ(0, position); TEST_TEARDOWN; /* * uspoof_areConfusable() */ TEST_SETUP int32_t checkResults; checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * areConfusableUTF8 */ TEST_SETUP int32_t checkResults; char s1[200]; char s2[200]; u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status); u_strToUTF8(s2, 
sizeof(s2), NULL, scMixed, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults); u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status); u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status); u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status); TEST_ASSERT_SUCCESS(status); checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; /* * getSkeleton */ TEST_SETUP UChar dest[100]; int32_t skelLength; skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, UPRV_LENGTHOF(dest), &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest)); TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength); skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, (char*)dest, UPRV_LENGTHOF(dest), &status); TEST_ASSERT_SUCCESS(status); skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status); TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status); TEST_ASSERT_EQ(3, skelLength); status = U_ZERO_ERROR; TEST_TEARDOWN; /* * get Inclusion and Recommended sets */ TEST_SETUP const USet *inclusions = NULL; const USet *recommended = NULL; inclusions = uspoof_getInclusionSet(&status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(TRUE, uset_isFrozen(inclusions)); status = U_ZERO_ERROR; recommended = uspoof_getRecommendedSet(&status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(TRUE, uset_isFrozen(recommended)); TEST_TEARDOWN; }
/*
 * Scanset conversion handler: reads characters from `input` for as long as
 * they belong to the set described by the pattern in `fmt`.
 *
 * input         stream to read code points from
 * info          parsed conversion spec; fWidth (when >= 0) caps the number of
 *               UTF-16 code units accepted, fSkipArg suppresses storing
 * args          args[0].ptrValue is the destination UChar buffer
 * fmt           points just past the '[' that opens the set pattern
 * fmtConsumed   receives the value returned by uset_applyPattern()
 * argConverted  receives 1 when the result was stored, 0 when fSkipArg is set
 *
 * Returns -1 if no character matched, otherwise the number of UTF-16 code
 * units that were accepted.
 */
static int32_t
u_scanf_scanset_handler(UFILE *input,
                        u_scanf_spec_info *info,
                        ufmt_args *args,
                        const UChar *fmt,
                        int32_t *fmtConsumed,
                        int32_t *argConverted)
{
    /* Code-unit budget: the field width when one was given, otherwise unlimited. */
    const int32_t budget = (info->fWidth >= 0) ? info->fWidth : INT32_MAX;
    int32_t remaining = budget;
    UErrorCode status = U_ZERO_ERROR;
    UChar *dest = (UChar *)(args[0].ptrValue);
    UChar32 c = 0;
    UBool gotChar = FALSE;
    UBool matchedAny = FALSE;

    /* Start from an empty set and let the pattern fill it in. */
    USet *charSet = uset_open(0, -1);

    /* The pattern starts at the '[' one position before fmt. */
    *fmtConsumed = uset_applyPattern(charSet, fmt - 1, -1, 0, &status);

    if (U_SUCCESS(status)) {
        /* Accept one code point at a time while it is a member of the set. */
        while (remaining > 0) {
            gotChar = ufile_getch32(input, &c);
            if (!gotChar || !uset_contains(charSet, c)) {
                /* End of input, or a character outside the set. */
                break;
            }

            matchedAny = TRUE;
            if (!info->fSkipArg) {
                int32_t written = 0;
                UBool appendFailed = FALSE;

                U16_APPEND(dest, written, remaining, c, appendFailed);
                if (appendFailed) {
                    break;
                }
                dest += written;
            }
            /* A supplementary code point costs two code units of the budget. */
            remaining -= (1 + U_IS_SUPPLEMENTARY(c));
        }

        /* The loop stopped on a character it did not consume: hand it back. */
        if (gotChar && remaining > 0) {
            u_fungetc(c, input);
        }
    }

    uset_close(charSet);

    /* Matching nothing at all counts as a failure. */
    if (!matchedAny) {
        return -1;
    }

    /* Terminate the stored string unless the argument is being skipped. */
    if (!info->fSkipArg) {
        *dest = 0x00;
    }

    /* One argument was converted, unless it was skipped. */
    *argConverted = !info->fSkipArg;
    return budget - remaining;
}
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory && fRanges) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); #if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4)) UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar)); ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager); int c=0; rangeStr[c++] = chOpenSquare; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { XMLCh buffer[10]; XMLSize_t len, j; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; XMLCh* p=buffer; while(*p) rangeStr[c++] = *p++; if(fRanges[i+1]!=fRanges[i]) { rangeStr[c++] = chDash; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; p=buffer; while(*p) rangeStr[c++] = *p++; } } rangeStr[c++] = chCloseSquare; rangeStr[c++] = chNull; UErrorCode ec=U_ZERO_ERROR; USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec); if(range) { ec = U_ZERO_ERROR; uint32_t cbCount=uset_serialize(range, NULL, 0, &ec); uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t)); ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager); ec = U_ZERO_ERROR; uset_serialize(range, buffer, cbCount, &ec); USerializedSet serializedSet; uset_getSerializedSet(&serializedSet, buffer, cbCount); int32_t nSets=uset_getSerializedRangeCount(&serializedSet); 
for(int32_t i=0; i<nSets; i++) { UChar32 start, end; uset_getSerializedRange(&serializedSet, i, &start, &end); lwrToken->addRange(start, end); } // does this release the memory allocated by the set? uset_setSerializedToOne(&serializedSet, 32); uset_close(range); } #else unsigned int exceptIndex = 0; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if XERCES_USE_TRANSCODER_ICU const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif const unsigned int exceptionsSize = sizeof(s_exceptions) / sizeof(s_exceptions[0]); // Add any exception chars. These are characters where the the // case mapping is not symmetric. (Unicode case mappings are not isomorphic...) while (exceptIndex < exceptionsSize) { if (s_exceptions[exceptIndex].baseChar < ch) { ++exceptIndex; } else if (s_exceptions[exceptIndex].baseChar == ch) { const XMLInt32 matchingChar = s_exceptions[exceptIndex].matchingChar; lwrToken->addRange( matchingChar, matchingChar); ++exceptIndex; } else { break; } } } } lwrToken->mergeRanges(this); #endif lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; // TODO(dbertoni) This is a temporary hack until we can change the ABI. // See Jira issue XERCESC-1866 for more details. // Overload the fCaseIToken data member to be the case-insensitive token // that's caching the case-insensitive one. We need this because tokens // have varying lifetimes. 
fCaseIToken->setCaseInsensitiveToken(this); } return fCaseIToken; }