/*
 * Adds to `set` every code point from U+0000 through U+10FFFF except the
 * U+F6xx row; see the LMBCS explanation above (search for F6xx).
 * The cnv, which and pErrorCode arguments are not consulted.
 */
static void
_LMBCSGetUnicodeSet(const UConverter *cnv,
                    USet *set,
                    UConverterUnicodeSet which,
                    UErrorCode *pErrorCode)
{
    /* first and last code point of the excluded U+F6xx row */
    enum { kExcludedFirst = 0xf600, kExcludedLast = 0xf6ff };

    (void)cnv;
    (void)which;
    (void)pErrorCode;

    /* everything below the excluded row ... */
    uset_addRange(set, 0, kExcludedFirst - 1);
    /* ... and everything above it */
    uset_addRange(set, kExcludedLast + 1, 0x10ffff);
}
/**
 * Test uset_clone(), uset_freeze(), uset_isFrozen() and uset_cloneAsThawed().
 *
 * A clone of the set returned by openIDSet() is frozen and then a modification
 * is attempted on it; the frozen clone must still equal the original.
 * A thawed clone of the frozen set must accept the same modification.
 * Every set that was opened is closed on every exit path.
 */
static void TestFreezable(void) {
    USet *idSet;
    USet *frozen;
    USet *thawed;

    idSet=openIDSet();
    if (idSet == NULL) {
        /* nothing has been opened yet, so there is nothing to close */
        log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
        return;
    }

    frozen=uset_clone(idSet);
    if (frozen == NULL) {
        log_err("uset_Clone() returned NULL\n");
        uset_close(idSet); /* do not leak the source set */
        return;
    }

    if(!uset_equals(frozen, idSet)) {
        log_err("uset_clone() did not make an equal copy\n");
    }

    uset_freeze(frozen);
    /* attempt to modify the frozen set; the checks below expect it to stay equal to idSet */
    uset_addRange(frozen, 0xd802, 0xd805);

    if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
        log_err("uset_freeze() or uset_isFrozen() does not work\n");
    }

    thawed=uset_cloneAsThawed(frozen);
    if (thawed == NULL) {
        log_err("uset_cloneAsThawed(frozen) returned NULL");
        uset_close(frozen);
        uset_close(idSet);
        return;
    }

    /* the same modification must take effect on the thawed clone */
    uset_addRange(thawed, 0xd802, 0xd805);

    if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
        log_err("uset_cloneAsThawed() does not work\n");
    }

    uset_close(idSet);
    uset_close(frozen);
    uset_close(thawed);
}
/**
 * Fills `unsafe` with the code points this function treats as unsafe for
 * the collator `coll`.
 *
 * The set is cleared first and then receives:
 *  - the characters matched by the pattern [[:^tccc=0:][:^lccc=0:]],
 *  - the Thai/Lao prevowels U+0E40..U+0E44 and U+0EC0..U+0EC4,
 *  - all surrogate code units U+D800..U+DFFF,
 *  - every code point of each contraction string of `coll` except the last
 *    code point of that string.
 *
 * @param coll   collator whose contractions are inspected
 * @param unsafe set that is cleared and filled
 * @param status ICU error code, passed through to the called ICU functions
 * @return the size of `unsafe` as reported by uset_size()
 */
U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
                  USet *unsafe,
                  UErrorCode *status)
{
    UChar buffer[internalBufferSize];
    int32_t len = 0;

    uset_clear(unsafe);

    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
                                          0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

    // add chars that fail the fcd check
    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

    // add Thai/Lao prevowels
    uset_addRange(unsafe, 0xe40, 0xe44);
    uset_addRange(unsafe, 0xec0, 0xec4);
    // add lead/trail surrogates
    uset_addRange(unsafe, 0xd800, 0xdfff);

    USet *contractions = uset_open(0,0);

    int32_t i = 0, j = 0;
    int32_t contsSize = ucol_getContractions(coll, contractions, status);
    UChar32 c = 0;
    // Contraction set consists only of strings
    // to get unsafe code points, we need to
    // break the strings apart and add them to the unsafe set
    for(i = 0; i < contsSize; i++) {
        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
        if(U_FAILURE(*status) || len > internalBufferSize) {
            // The string was not (completely) written into buffer, e.g. because
            // it is longer than internalBufferSize. Walking `len` units would
            // read past the end of buffer, so stop here and leave the error
            // in *status for the caller.
            break;
        }
        if(len > 0) {
            j = 0;
            while(j < len) {
                U16_NEXT(buffer, j, len, c);
                // j now points past c; skip c when it is the last code point of the string
                if(j < len) {
                    uset_add(unsafe, c);
                }
            }
        }
    }

    uset_close(contractions);

    return uset_size(unsafe);
}
// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp // except we use ICU instead of CoreFoundation character classes. static USet* getSmartSet(bool isPreviousCharacter) { static USet* preSmartSet = nullptr; static USet* postSmartSet = nullptr; USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet; if (!smartSet) { // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline) UErrorCode ec = U_ZERO_ERROR; String whitespaceAndNewline = ASCIILiteral("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]"); smartSet = uset_openPattern(StringView(whitespaceAndNewline).upconvertedCharacters(), whitespaceAndNewline.length(), &ec); ASSERT(U_SUCCESS(ec)); // CJK ranges // FIXME: Looks like all these ranges include one extra character past the end. uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF) uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF) uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF) uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF) uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF) uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F) uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F) uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF) uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D) if (isPreviousCharacter) { // FIXME: Silly to convert this to a WTF::String just to loop through the characters. 
addAllCodePoints(smartSet, ASCIILiteral("([\"\'#$/-`{")); preSmartSet = smartSet; } else { // FIXME: Silly to convert this to a WTF::String just to loop through the characters. addAllCodePoints(smartSet, ASCIILiteral(")].,;:?\'!\"%*-/}")); // Punctuation (kCFCharacterSetPunctuation) UErrorCode ec = U_ZERO_ERROR; String punctuationClass = ASCIILiteral("[:P:]"); USet* icuPunct = uset_openPattern(StringView(punctuationClass).upconvertedCharacters(), punctuationClass.length(), &ec); ASSERT(U_SUCCESS(ec)); uset_addAll(smartSet, icuPunct); uset_close(icuPunct); postSmartSet = smartSet; } } return smartSet; }
// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp // except we use icu in place of CoreFoundations character classes. static USet* getSmartSet(bool isPreviousCharacter) { static USet* preSmartSet = nullptr; static USet* postSmartSet = nullptr; USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet; if (!smartSet) { // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline) UErrorCode ec = U_ZERO_ERROR; String whitespaceAndNewline("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]"); smartSet = uset_openPattern(whitespaceAndNewline.charactersWithNullTermination().data(), whitespaceAndNewline.length(), &ec); ASSERT(U_SUCCESS(ec)); // CJK ranges uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF) uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF) uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF) uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF) uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF) uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F) uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F) uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF) uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D) if (isPreviousCharacter) { addAllCodePoints(smartSet, "([\"\'#$/-`{"); preSmartSet = smartSet; } else { addAllCodePoints(smartSet, ")].,;:?\'!\"%*-/}"); // Punctuation (kCFCharacterSetPunctuation) UErrorCode ec = U_ZERO_ERROR; String punctuationClass("[:P:]"); USet* icuPunct = 
uset_openPattern(punctuationClass.charactersWithNullTermination().data(), punctuationClass.length(), &ec); ASSERT(U_SUCCESS(ec)); uset_addAll(smartSet, icuPunct); uset_close(icuPunct); postSmartSet = smartSet; } } return smartSet; }
/** * Basic API test for uset.x */ static void TestAPI() { USet* set; USet* set2; UErrorCode ec; /* [] */ set = uset_openEmpty(); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 0); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 1); uset_clear(set); expect(set, "", "abc{ab}", NULL); uset_close(set); /* [ABC] */ set = uset_open(0x0041, 0x0043); expect(set, "ABC", "DEF{ab}", NULL); uset_close(set); /* [a-c{ab}] */ ec = U_ZERO_ERROR; set = uset_openPattern(PAT, PAT_LEN, &ec); if(U_FAILURE(ec)) { log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); return; } if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { log_err("uset_resemblesPattern of PAT failed\n"); } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ uset_add(set, 0x64); expect(set, "abcd{ab}", "ef{bc}", NULL); /* [acd{ab}{bc}] */ uset_remove(set, 0x62); uset_addString(set, STR_bc, STR_bc_LEN); expect(set, "acd{ab}{bc}", "bef{cd}", NULL); /* [acd{bc}] */ uset_removeString(set, STR_ab, STR_ab_LEN); expect(set, "acd{bc}", "bfg{ab}", NULL); /* [^acd{bc}] */ uset_complement(set); expect(set, "bef{bc}", "acd{ac}", NULL); /* [a-e{bc}] */ uset_complement(set); uset_addRange(set, 0x0062, 0x0065); expect(set, "abcde{bc}", "fg{ab}", NULL); /* [de{bc}] */ uset_removeRange(set, 0x0050, 0x0063); expect(set, "de{bc}", "bcfg{ab}", NULL); /* [g-l] */ uset_set(set, 0x0067, 0x006C); expect(set, "ghijkl", "de{bc}", NULL); if (uset_indexOf(set, 0x0067) != 0) { log_err("uset_indexOf failed finding correct index of 'g'\n"); } if (uset_charAt(set, 0) != 0x0067) { log_err("uset_charAt failed finding correct char 'g' at index 0\n"); } /* How to test this one...? 
*/ uset_compact(set); /* [g-i] */ uset_retain(set, 0x0067, 0x0069); expect(set, "ghi", "dejkl{bc}", NULL); /* UCHAR_ASCII_HEX_DIGIT */ uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); if(U_FAILURE(ec)) { log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); return; } expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); /* [ab] */ uset_clear(set); uset_addAllCodePoints(set, STR_ab, STR_ab_LEN); expect(set, "ab", "def{ab}", NULL); if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){ log_err("set should not conatin all characters of \"bc\" \n"); } /* [] */ set2 = uset_open(1, 1); uset_clear(set2); /* space */ uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); expect(set2, " ", "abcdefghi{bc}", NULL); /* [a-c] */ uset_set(set2, 0x0061, 0x0063); /* [g-i] */ uset_set(set, 0x0067, 0x0069); /* [a-c g-i] */ if (uset_containsSome(set, set2)) { log_err("set should not contain some of set2 yet\n"); } uset_complementAll(set, set2); if (!uset_containsSome(set, set2)) { log_err("set should contain some of set2\n"); } expect(set, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_removeAll(set, set2); expect(set, "ghi", "abcdef{bc}", NULL); /* [a-c g-i] */ uset_addAll(set2, set); expect(set2, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_retainAll(set2, set); expect(set2, "ghi", "abcdef{bc}", NULL); uset_close(set); uset_close(set2); }