U_CDECL_BEGIN

// USetAdder implementation
//
// NOTE(review): the earlier comment here said this code does not use uset.h
// in order to reduce code dependencies, yet the body calls uset_add().
// Confirm which of the two is intended.

// Adds the single code point c to set.
static void U_CALLCONV
_set_add(USet *set, UChar32 c)
{
    uset_add(set, c);
}
/*
 * Adds every code point of the UTF-16 string s[0..length) to set.
 * The caller must pass length>=0.
 */
static void
_set_addAll(USet *set, const UChar *s, int32_t length)
{
    UChar32 cp;
    int32_t pos=0;

    while(pos<length) {
        /* U16_NEXT reads one code point and moves pos past it */
        U16_NEXT(s, pos, length, cp);
        uset_add(set, cp);
    }
}
/*
 * Parses the ';'-delimited file filename with specialCasingLineFn, which
 * fills the specialCasings table, then sorts the table by code point and
 * replaces multiple entries for one code point by a single "complex" entry.
 *
 * Returns immediately if pErrorCode is NULL or already holds a failure.
 */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode)
{
    char *fields[5][2];
    int32_t idx, removedCount;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);

    /* bring the parsed entries into code point order */
    if(specialCasingCount>0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* collapse several entries for the same code point into one "complex" entry */
    removedCount=0;
    for(idx=1; idx<specialCasingCount; ++idx) {
        if(specialCasings[idx-1].code!=specialCasings[idx].code) {
            continue; /* not a duplicate */
        }

        /* the earlier entry gets the largest key so that re-sorting moves it to the end */
        specialCasings[idx-1].code=0x7fffffff;

        /* the later entry stays, as a complex one without stored mappings */
        specialCasings[idx].isComplex=TRUE;
        specialCasings[idx].lowerCase[0]=0;
        specialCasings[idx].upperCase[0]=0;
        specialCasings[idx].titleCase[0]=0;
        ++removedCount;
    }

    /* entries were marked for removal: sort them to the end and cut them off */
    if(removedCount>0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
        specialCasingCount-=removedCount;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*
     * Add one complex mapping to caseSensitive that was filtered out above:
     * Greek final Sigma has a conditional mapping but not locale-sensitive,
     * and it is taken when lowercasing just U+03A3 alone.
     * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
     */
    uset_add(caseSensitive, 0x3c2);
}
/*
 * Line callback for parsing SpecialCasing.txt: stores one entry in
 * specialCasings[specialCasingCount] and then increments specialCasingCount.
 *
 * Fields used: 0 = code point (hex), 1 = lowercase, 2 = titlecase,
 * 3 = uppercase, 4 = optional condition text.
 * An entry with condition text is stored as "complex" without mappings.
 * Otherwise each mapping is stored with its length in element 0 and its
 * UChars from element 1 on, and the code point and all mapping code points
 * are added to caseSensitive.
 *
 * context and fieldCount are not used.
 * On a syntax error, a mapping parse error, or when the table becomes full,
 * a message is printed to stderr and the process exits.
 */
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    const char *digits, *limit;
    char *end;

    /* get code point */
    digits=u_skipWhitespace(fields[0][0]);
    specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(digits, &end, 16);
    limit=u_skipWhitespace(end);
    /*
     * end==digits means that no hex digit was converted at all.
     * Comparing against the pointer that was actually parsed also rejects a
     * field that contains only white space, which would otherwise be read as U+0000.
     */
    if(end==digits || limit!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* is this a complex mapping? */
    if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
        /* there is some condition text in the fifth field */
        specialCasings[specialCasingCount].isComplex=TRUE;

        /* do not store any actual mappings for this */
        specialCasings[specialCasingCount].lowerCase[0]=0;
        specialCasings[specialCasingCount].upperCase[0]=0;
        specialCasings[specialCasingCount].titleCase[0]=0;
    } else {
        /* just set the "complex" flag and get the case mappings */
        specialCasings[specialCasingCount].isComplex=FALSE;
        specialCasings[specialCasingCount].lowerCase[0]=
            (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].upperCase[0]=
            (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].titleCase[0]=
            (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        /* the code point and everything it maps to are case-sensitive */
        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
    }

    /* stop as soon as the count reaches the limit, before another entry could be written */
    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/**
 * Replaces the contents of unsafe with the collator's "unsafe" code points:
 * the code points matched by "[[:^tccc=0:][:^lccc=0:]]", the Thai/Lao
 * prevowels, all surrogates, and every code point of each contraction string
 * of coll except the last code point of that string.
 *
 * @param coll   collator whose contractions are examined
 * @param unsafe set to fill; it is cleared first
 * @param status ICU error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *               temporary contraction set cannot be created
 * @return uset_size(unsafe) at the time of return
 */
U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
                  USet *unsafe,
                  UErrorCode *status)
{
    UChar buffer[internalBufferSize];
    int32_t len = 0;

    uset_clear(unsafe);

    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

    // add chars that fail the fcd check
    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

    // add Thai/Lao prevowels
    uset_addRange(unsafe, 0xe40, 0xe44);
    uset_addRange(unsafe, 0xec0, 0xec4);
    // add lead/trail surrogates
    uset_addRange(unsafe, 0xd800, 0xdfff);

    USet *contractions = uset_open(0,0);
    if(contractions == NULL) {
        // Do not hand a NULL set to the calls below.
        // An earlier failure code, if any, is kept.
        if(U_SUCCESS(*status)) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
        return uset_size(unsafe);
    }

    int32_t i = 0, j = 0;
    int32_t contsSize = ucol_getContractions(coll, contractions, status);
    UChar32 c = 0;
    // Contraction set consists only of strings
    // to get unsafe code points, we need to
    // break the strings apart and add them to the unsafe set
    for(i = 0; i < contsSize; i++) {
        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
        if(len > 0) {
            j = 0;
            while(j < len) {
                U16_NEXT(buffer, j, len, c);
                // j < len is false after reading the final code point,
                // so the last code point of the string is not added
                if(j < len) {
                    uset_add(unsafe, c);
                }
            }
        }
    }

    uset_close(contractions);

    return uset_size(unsafe);
}
U_CDECL_BEGIN
/*
 * Range callback: examines the collation element CE shared by the code points
 * start..limit-1 and records them in the sets of the contContext passed as
 * context.
 *
 * - CE is a contraction (or a SPEC_PROC_TAG special while addPrefixes is set):
 *   addSpecial() is called for each code point that is not in
 *   removedContractions.
 * - CE is an expansion and an expansions set was supplied: each code point is
 *   added to that set.
 *
 * Returns FALSE if *status holds a failure at the end, otherwise TRUE.
 */
static UBool U_CALLCONV
_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
{
    contContext *ctx = (contContext *)context;
    UErrorCode *status = ctx->status;
    USet *expansions = ctx->expansions;
    USet *removed = ctx->removedContractions;
    UBool addPrefixes = ctx->addPrefixes;
    UChar contraction[internalBufferSize];

    if(isSpecial(CE)) {
        if((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG) {
            for(; start < limit && U_SUCCESS(*status); start++) {
                // suppressed contractions must not be added
                if(removed && uset_contains(removed, start)) {
                    continue;
                }
                // the first unit goes into the middle of the buffer because
                // the contraction may grow toward the right or toward the left
                contraction[internalBufferSize/2] = (UChar)start;
                addSpecial(ctx, contraction, internalBufferSize, CE,
                           internalBufferSize/2, internalBufferSize/2+1, status);
            }
        } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
            for(; start < limit && U_SUCCESS(*status); start++) {
                uset_add(expansions, start);
            }
        }
    }

    return (UBool)U_SUCCESS(*status);
}
// Adds every code point of |string| to |smartSet|.
// The string is indexed in UTF-16 code units; a lead surrogate followed by a
// trail surrogate is added as the one supplementary code point they encode,
// not as two separate surrogate code points. Unpaired surrogates are added as is.
static void addAllCodePoints(USet* smartSet, const String& string)
{
    const size_t length = string.length();
    size_t i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(string, i, length, c); // Reads one code point and advances i past it.
        uset_add(smartSet, c);
    }
}
// Adds every code point of |string| to |smartSet|.
// The UTF-16 buffer is decoded code point by code point: a lead surrogate
// followed by a trail surrogate is added as the one supplementary code point
// they encode, not as two separate surrogate code points. Unpaired surrogates
// are added as is.
static void addAllCodePoints(USet* smartSet, const String& string)
{
    const UChar* characters = string.characters();
    const size_t length = string.length();
    size_t i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(characters, i, length, c); // Reads one code point and advances i past it.
        uset_add(smartSet, c);
    }
}
/** * Basic API test for uset.x */ static void TestAPI() { USet* set; USet* set2; UErrorCode ec; /* [] */ set = uset_openEmpty(); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 0); expect(set, "", "abc{ab}", NULL); uset_close(set); set = uset_open(1, 1); uset_clear(set); expect(set, "", "abc{ab}", NULL); uset_close(set); /* [ABC] */ set = uset_open(0x0041, 0x0043); expect(set, "ABC", "DEF{ab}", NULL); uset_close(set); /* [a-c{ab}] */ ec = U_ZERO_ERROR; set = uset_openPattern(PAT, PAT_LEN, &ec); if(U_FAILURE(ec)) { log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); return; } if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { log_err("uset_resemblesPattern of PAT failed\n"); } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ uset_add(set, 0x64); expect(set, "abcd{ab}", "ef{bc}", NULL); /* [acd{ab}{bc}] */ uset_remove(set, 0x62); uset_addString(set, STR_bc, STR_bc_LEN); expect(set, "acd{ab}{bc}", "bef{cd}", NULL); /* [acd{bc}] */ uset_removeString(set, STR_ab, STR_ab_LEN); expect(set, "acd{bc}", "bfg{ab}", NULL); /* [^acd{bc}] */ uset_complement(set); expect(set, "bef{bc}", "acd{ac}", NULL); /* [a-e{bc}] */ uset_complement(set); uset_addRange(set, 0x0062, 0x0065); expect(set, "abcde{bc}", "fg{ab}", NULL); /* [de{bc}] */ uset_removeRange(set, 0x0050, 0x0063); expect(set, "de{bc}", "bcfg{ab}", NULL); /* [g-l] */ uset_set(set, 0x0067, 0x006C); expect(set, "ghijkl", "de{bc}", NULL); if (uset_indexOf(set, 0x0067) != 0) { log_err("uset_indexOf failed finding correct index of 'g'\n"); } if (uset_charAt(set, 0) != 0x0067) { log_err("uset_charAt failed finding correct char 'g' at index 0\n"); } /* How to test this one...? 
*/ uset_compact(set); /* [g-i] */ uset_retain(set, 0x0067, 0x0069); expect(set, "ghi", "dejkl{bc}", NULL); /* UCHAR_ASCII_HEX_DIGIT */ uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); if(U_FAILURE(ec)) { log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); return; } expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); /* [ab] */ uset_clear(set); uset_addAllCodePoints(set, STR_ab, STR_ab_LEN); expect(set, "ab", "def{ab}", NULL); if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){ log_err("set should not conatin all characters of \"bc\" \n"); } /* [] */ set2 = uset_open(1, 1); uset_clear(set2); /* space */ uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); expect(set2, " ", "abcdefghi{bc}", NULL); /* [a-c] */ uset_set(set2, 0x0061, 0x0063); /* [g-i] */ uset_set(set, 0x0067, 0x0069); /* [a-c g-i] */ if (uset_containsSome(set, set2)) { log_err("set should not contain some of set2 yet\n"); } uset_complementAll(set, set2); if (!uset_containsSome(set, set2)) { log_err("set should contain some of set2\n"); } expect(set, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_removeAll(set, set2); expect(set, "ghi", "abcdef{bc}", NULL); /* [a-c g-i] */ uset_addAll(set2, set); expect(set2, "abcghi", "def{bc}", NULL); /* [g-i] */ uset_retainAll(set2, set); expect(set2, "ghi", "abcdef{bc}", NULL); uset_close(set); uset_close(set2); }
/*
 * Line callback for the UnicodeData parser: builds a Props record for the
 * code point of one line and hands it to setProps().
 *
 * Fields used: 0 = code point (hex), 2 = general category name,
 * 3 = canonical combining class (decimal, at most 0xff),
 * 12/13/14 = simple uppercase/lowercase/titlecase mapping (hex, may be empty).
 * For each mapping that is neither 0 nor the code point itself, both the
 * code point and the mapping are added to caseSensitive.
 * The record is also linked to the next unused specialCasings and
 * caseFoldings entries when their code points match, which advances
 * specialCasingIndex and caseFoldingIndex.
 *
 * context and fieldCount are not used.
 * Any syntax error, a noncharacter code point, or a code point that does not
 * come after the previous one prints a message to stderr and exits.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    static UChar32 prevCode=0; /* code point of the previous call, for the order check */
    UChar32 value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    /* the number must be non-empty and must fill the whole field */
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.gc=(uint8_t)i;
    } else {
        fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
            fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get canonical combining class, field 3 */
    value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
        fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    p.cc=(uint8_t)value;

    /* get uppercase mapping, field 12 */
    value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    /* an empty field passes this check and yields value==0 (no mapping) */
    if(end!=fields[12][1]) {
        fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.upperCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get lowercase value, field 13 */
    value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    if(end!=fields[13][1]) {
        fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.lowerCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get titlecase value, field 14 */
    value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    if(end!=fields[14][1]) {
        fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.titleCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* set additional properties from previously parsed files */
    /* each index only moves forward when the current code point matches its next entry */
    if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
        p.specialCasing=specialCasings+specialCasingIndex++;
    } else {
        p.specialCasing=NULL;
    }
    if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
        p.caseFolding=caseFoldings+caseFoldingIndex++;

        /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
        if( p.caseFolding->status=='C' &&
            p.caseFolding->simple==p.lowerCase
        ) {
            p.caseFolding=NULL;
        }
    } else {
        p.caseFolding=NULL;
    }

    /* check for non-character code points */
    /* that is, code points ending in FFFE or FFFF, and U+FDD0..U+FDEF */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    /* code point 0 is exempt because prevCode starts out as 0 */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* properties for a single code point */
    setProps(&p);

    prevCode=p.code;
}
/*
 * Line callback for parsing CaseFolding.txt: stores one entry in
 * caseFoldings[caseFoldingCount] and usually increments caseFoldingCount.
 *
 * Fields used: 0 = code point (hex), 1 = status letter, 2 = mapping.
 * - Status 'L' lines are skipped.
 * - An 'S' line directly after an 'F' entry for the same code point, or an
 *   'F' line directly after an 'S' entry, is merged into that earlier entry
 *   instead of being stored separately.
 * - An 'I' or 'T' line replaces the directly preceding entries for the same
 *   code point and is stored without mappings, as a marker only.
 * Except for status 'T', the code point and its mapping are added to
 * caseSensitive.
 *
 * context and fieldCount are not used.
 * On a syntax error, an unknown status, a mapping parse error, entries out of
 * order, or when the table becomes full, a message is printed to stderr and
 * the process exits.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    const char *digits, *limit;
    char *end;
    static UChar32 prevCode=0; /* code point of the last stored entry, for the order check */
    int32_t count;
    char status;

    /* get code point */
    digits=u_skipWhitespace(fields[0][0]);
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(digits, &end, 16);
    limit=u_skipWhitespace(end);
    /*
     * end==digits means that no hex digit was converted at all.
     * Comparing against the pointer that was actually parsed also rejects a
     * field that contains only white space, which would otherwise be read as U+0000.
     */
    if(end==digits || limit!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping: its length goes into full[0], its UChars into full[1..] */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* the slot is reused for the same code point, so the order check below must not reject it */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* stop as soon as the count reaches the limit, before another entry could be written */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}