// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp // except we use ICU instead of CoreFoundation character classes. static USet* getSmartSet(bool isPreviousCharacter) { static USet* preSmartSet = nullptr; static USet* postSmartSet = nullptr; USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet; if (!smartSet) { // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline) UErrorCode ec = U_ZERO_ERROR; String whitespaceAndNewline = ASCIILiteral("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]"); smartSet = uset_openPattern(StringView(whitespaceAndNewline).upconvertedCharacters(), whitespaceAndNewline.length(), &ec); ASSERT(U_SUCCESS(ec)); // CJK ranges // FIXME: Looks like all these ranges include one extra character past the end. uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF) uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF) uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF) uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF) uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF) uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F) uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F) uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF) uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D) if (isPreviousCharacter) { // FIXME: Silly to convert this to a WTF::String just to loop through the characters. 
addAllCodePoints(smartSet, ASCIILiteral("([\"\'#$/-`{")); preSmartSet = smartSet; } else { // FIXME: Silly to convert this to a WTF::String just to loop through the characters. addAllCodePoints(smartSet, ASCIILiteral(")].,;:?\'!\"%*-/}")); // Punctuation (kCFCharacterSetPunctuation) UErrorCode ec = U_ZERO_ERROR; String punctuationClass = ASCIILiteral("[:P:]"); USet* icuPunct = uset_openPattern(StringView(punctuationClass).upconvertedCharacters(), punctuationClass.length(), &ec); ASSERT(U_SUCCESS(ec)); uset_addAll(smartSet, icuPunct); uset_close(icuPunct); postSmartSet = smartSet; } } return smartSet; }
// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp // except we use icu in place of CoreFoundations character classes. static USet* getSmartSet(bool isPreviousCharacter) { static USet* preSmartSet = nullptr; static USet* postSmartSet = nullptr; USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet; if (!smartSet) { // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline) UErrorCode ec = U_ZERO_ERROR; String whitespaceAndNewline("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]"); smartSet = uset_openPattern(whitespaceAndNewline.charactersWithNullTermination().data(), whitespaceAndNewline.length(), &ec); ASSERT(U_SUCCESS(ec)); // CJK ranges uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF) uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF) uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF) uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF) uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF) uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F) uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F) uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF) uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D) if (isPreviousCharacter) { addAllCodePoints(smartSet, "([\"\'#$/-`{"); preSmartSet = smartSet; } else { addAllCodePoints(smartSet, ")].,;:?\'!\"%*-/}"); // Punctuation (kCFCharacterSetPunctuation) UErrorCode ec = U_ZERO_ERROR; String punctuationClass("[:P:]"); USet* icuPunct = 
uset_openPattern(punctuationClass.charactersWithNullTermination().data(), punctuationClass.length(), &ec); ASSERT(U_SUCCESS(ec)); uset_addAll(smartSet, icuPunct); uset_close(icuPunct); postSmartSet = smartSet; } } return smartSet; }
/** * Make sure that when non-invariant chars are passed to uset_openPattern * they do not cause an ugly failure mode (e.g. assertion failure). * JB#3795. */ static void TestNonInvariantPattern() { UErrorCode ec = U_ZERO_ERROR; /* The critical part of this test is that the following pattern must contain a non-invariant character. */ static const char *pattern = "[:ccc!=0:]"; UChar buf[256]; int32_t len = u_unescape(pattern, buf, 256); /* This test 'fails' by having an assertion failure within the following call. It passes by running to completion with no assertion failure. */ USet *set = uset_openPattern(buf, len, &ec); uset_close(set); }
static void TestSerialized() { uint16_t buffer[1000]; USerializedSet sset; USet *set; UErrorCode errorCode; UChar32 c; int32_t length; /* use a pattern that generates both BMP and supplementary code points */ U_STRING_DECL(pattern, "[:Cf:]", 6); U_STRING_INIT(pattern, "[:Cf:]", 6); errorCode=U_ZERO_ERROR; set=uset_openPattern(pattern, -1, &errorCode); if(U_FAILURE(errorCode)) { log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode)); return; } length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode)); uset_close(set); return; } uset_getSerializedSet(&sset, buffer, length); for(c=0; c<=0x10ffff; ++c) { if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) { log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c); break; } } uset_close(set); }
// cpattern is expected to be a class (= surrounded by square brackets) USet *create_set_from_argv(const char *cpattern, UBool negate, error_t **error) { USet *uset; UString *ustr; UErrorCode status; status = U_ZERO_ERROR; if (NULL == (ustr = ustring_convert_argv_from_local(cpattern, error, TRUE))) { return NULL; } uset = uset_openPattern(ustr->ptr, ustr->len, &status); if (U_FAILURE(status)) { ustring_destroy(ustr); icu_error_set(error, FATAL, status, "uset_openPattern"); return NULL; } ustring_destroy(ustr); if (negate) { uset_complement(uset); } uset_freeze(uset); return uset; }
void SSearchTest::monkeyTest(char *params) { // ook! UErrorCode status = U_ZERO_ERROR; //UCollator *coll = ucol_open(NULL, &status); UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); if (U_FAILURE(status)) { errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status)); return; } CollData *monkeyData = new CollData(coll, status); USet *expansions = uset_openEmpty(); USet *contractions = uset_openEmpty(); ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39); USet *letters = uset_openPattern(letter_pattern, 39, &status); SetMonkey letterMonkey(letters); StringSetMonkey contractionMonkey(contractions, coll, monkeyData); StringSetMonkey expansionMonkey(expansions, coll, monkeyData); UnicodeString testCase; UnicodeString alternate; UnicodeString pattern, altPattern; UnicodeString prefix, altPrefix; UnicodeString suffix, altSuffix; Monkey *monkeys[] = { &letterMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey, &contractionMonkey, &expansionMonkey}; int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); // int32_t nonMatchCount = 0; UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY}; const char *strengthNames[] = {"primary", "secondary", "tertiary"}; int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); int32_t loopCount = quick? 
1000 : 10000; int32_t firstStrength = 0; int32_t lastStrength = strengthCount - 1; //*/ 0; if (params != NULL) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS UnicodeString p(params); loopCount = getIntParam("loop", p, loopCount); m_seed = getIntParam("seed", p, m_seed); RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status); if (m.find()) { UnicodeString breakType = m.group(1, status); for (int32_t s = 0; s < strengthCount; s += 1) { if (breakType == strengthNames[s]) { firstStrength = lastStrength = s; break; } } m.reset(); p = m.replaceFirst("", status); } if (RegexMatcher("\\S", p, 0, status).find()) { // Each option is stripped out of the option string as it is processed. // All options have been checked. The option string should have been completely emptied.. char buf[100]; p.extract(buf, sizeof(buf), NULL, status); buf[sizeof(buf)-1] = 0; errln("Unrecognized or extra parameter: %s\n", buf); return; } #else infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters."); #endif } for(int32_t s = firstStrength; s <= lastStrength; s += 1) { int32_t notFoundCount = 0; logln("Setting strength to %s.", strengthNames[s]); ucol_setStrength(coll, strengths[s]); // TODO: try alternate prefix and suffix too? // TODO: alterntaes are only equal at primary strength. Is this OK? 
for(int32_t t = 0; t < loopCount; t += 1) { uint32_t seed = m_seed; // int32_t nmc = 0; generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); // pattern notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed); testCase.remove(); testCase.append(prefix); testCase.append(/*alt*/pattern); // prefix + pattern notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed); testCase.append(suffix); // prefix + pattern + suffix notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed); testCase.remove(); testCase.append(pattern); testCase.append(suffix); // pattern + suffix notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed); } logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount); } uset_close(contractions); uset_close(expansions); uset_close(letters); delete monkeyData; ucol_close(coll); }
/**
 * Builds the collation data used by the search tests for the given collator.
 *
 * Two things are computed:
 *  - ceToCharsStartingWith: for every tested character or string (assigned,
 *    non-"other" characters plus the collator's contractions and expansions,
 *    minus the implicitly handled Han/Jamo/Hangul ranges), an entry keyed by
 *    the first collation element of its CE list.
 *  - jamoLimits, minHan and maxHan: CE boundaries derived from the CE lists of
 *    the Jamo range endpoints and of the [:Unified_Ideograph:] range endpoints.
 *
 * @param collator the collator to analyze. It is used directly unless
 *                 CLONE_COLLATOR is defined, in which case a clone is made.
 * @param status   ICU error code; on failure the object is left partially
 *                 initialized and the constructor returns early.
 */
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characters we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        // The first set may have been opened even though the second failed;
        // closing a NULL set is harmless, so release both unconditionally.
        uset_close(charsToRemove);
        uset_close(charsToTest);
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (ceToCharsStartingWith == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            // The item is a code point range: register each code point.
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                if (ceList == NULL) {
                    delete st;
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                // st is deleted right after put(); presumably put() keeps its
                // own copy of the string - confirm against CEToStringsMap.
                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            // The item is a string (e.g. a contraction).
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            if (ceList == NULL) {
                delete st;
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    // Collect the first and last code point of every Han range.
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
    UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
    CEList hanList(coll, hanString, status);
    CEList jamoList(coll, jamoString, status);
    int32_t j = 0;

    if (U_FAILURE(status)) {
        return;
    }

    // Keep only the non-continuation CEs of the Jamo boundary characters.
    for (int32_t c = 0; c < jamoList.size(); c += 1) {
        uint32_t jce = jamoList[c];

        if (! isContinuation(jce)) {
            jamoLimits[j++] = jce;
        }
    }

    jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

    minHan = 0xFFFFFFFF;
    maxHan = 0;

    // Step by two through the CE list of the Han range endpoints, tracking
    // the smallest and largest value seen.
    for(int32_t h = 0; h < hanList.size(); h += 2) {
        uint32_t han = (uint32_t) hanList[h];

        if (han < minHan) {
            minHan = han;
        }

        if (han > maxHan) {
            maxHan = han;
        }
    }

    maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
/** * Basic API test for uset.x */ static void TestAPI() { USet* set; USet* set2; UErrorCode ec; /* [] */ set = uset_openEmpty(); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 0); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 1); uset_clear(set); expect(set, "", "abc{ab}", NULL); uset_close(set); /* [ABC] */ set = uset_open(0x0041, 0x0043); expect(set, "ABC", "DEF{ab}", NULL); uset_close(set); /* [a-c{ab}] */ ec = U_ZERO_ERROR; set = uset_openPattern(PAT, PAT_LEN, &ec); if(U_FAILURE(ec)) { log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); return; } if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { log_err("uset_resemblesPattern of PAT failed\n"); } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ uset_add(set, 0x64); expect(set, "abcd{ab}", "ef{bc}", NULL); /* [acd{ab}{bc}] */ uset_remove(set, 0x62); uset_addString(set, STR_bc, STR_bc_LEN); expect(set, "acd{ab}{bc}", "bef{cd}", NULL); /* [acd{bc}] */ uset_removeString(set, STR_ab, STR_ab_LEN); expect(set, "acd{bc}", "bfg{ab}", NULL); /* [^acd{bc}] */ uset_complement(set); expect(set, "bef{bc}", "acd{ac}", NULL); /* [a-e{bc}] */ uset_complement(set); uset_addRange(set, 0x0062, 0x0065); expect(set, "abcde{bc}", "fg{ab}", NULL); /* [de{bc}] */ uset_removeRange(set, 0x0050, 0x0063); expect(set, "de{bc}", "bcfg{ab}", NULL); /* [g-l] */ uset_set(set, 0x0067, 0x006C); expect(set, "ghijkl", "de{bc}", NULL); if (uset_indexOf(set, 0x0067) != 0) { log_err("uset_indexOf failed finding correct index of 'g'\n"); } if (uset_charAt(set, 0) != 0x0067) { log_err("uset_charAt failed finding correct char 'g' at index 0\n"); } /* How to test this one...? 
*/ uset_compact(set); /* [g-i] */ uset_retain(set, 0x0067, 0x0069); expect(set, "ghi", "dejkl{bc}", NULL); /* UCHAR_ASCII_HEX_DIGIT */ uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); if(U_FAILURE(ec)) { log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); return; } expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); /* [ab] */ uset_clear(set); uset_addAllCodePoints(set, STR_ab, STR_ab_LEN); expect(set, "ab", "def{ab}", NULL); if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){ log_err("set should not conatin all characters of \"bc\" \n"); } /* [] */ set2 = uset_open(1, 1); uset_clear(set2); /* space */ uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); expect(set2, " ", "abcdefghi{bc}", NULL); /* [a-c] */ uset_set(set2, 0x0061, 0x0063); /* [g-i] */ uset_set(set, 0x0067, 0x0069); /* [a-c g-i] */ if (uset_containsSome(set, set2)) { log_err("set should not contain some of set2 yet\n"); } uset_complementAll(set, set2); if (!uset_containsSome(set, set2)) { log_err("set should contain some of set2\n"); } expect(set, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_removeAll(set, set2); expect(set, "ghi", "abcdef{bc}", NULL); /* [a-c g-i] */ uset_addAll(set2, set); expect(set2, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_retainAll(set2, set); expect(set2, "ghi", "abcdef{bc}", NULL); uset_close(set); uset_close(set2); }
/**
 * Opens a new set from the pattern [:ID_Continue:].
 *
 * The ICU status code is not reported to the caller; the function returns
 * whatever uset_openPattern returned.
 */
static USet *openIDSet() {
    U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
    UErrorCode status = U_ZERO_ERROR;
    USet *idSet;

    U_STRING_INIT(pattern, "[:ID_Continue:]", 15);

    idSet = uset_openPattern(pattern, 15, &status);
    return idSet;
}