Beispiel #1
0
U_CDECL_BEGIN

// USetAdder implementation
// Does not use uset.h to reduce code dependencies
static void U_CALLCONV
_set_add(USet *set, UChar32 c) {
    uset_add(set, c);
}
Beispiel #2
0
static void
_set_addAll(USet *set, const UChar *s, int32_t length) {
    UChar32 c;
    int32_t i;

    /* needs length>=0 */
    for(i=0; i<length; /* U16_NEXT advances i */) {
        U16_NEXT(s, i, length, c);
        uset_add(set, c);
    }
}
Beispiel #3
0
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
    char *fields[5][2];
    int32_t i, j;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);

    /* sort the special casing entries by code point */
    if(specialCasingCount>0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* replace multiple entries for any code point by one "complex" one */
    j=0;
    for(i=1; i<specialCasingCount; ++i) {
        if(specialCasings[i-1].code==specialCasings[i].code) {
            /* there is a duplicate code point */
            specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
            specialCasings[i].isComplex=TRUE;       /* make the following one complex */
            specialCasings[i].lowerCase[0]=0;
            specialCasings[i].upperCase[0]=0;
            specialCasings[i].titleCase[0]=0;
            ++j;
        }
    }

    /* if some entries just were removed, then re-sort */
    if(j>0) {
        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
                       compareSpecialCasings, NULL, FALSE, pErrorCode);
        specialCasingCount-=j;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*
     * Add one complex mapping to caseSensitive that was filtered out above:
     * Greek final Sigma has a conditional mapping but not locale-sensitive,
     * and it is taken when lowercasing just U+03A3 alone.
     * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
     */
    uset_add(caseSensitive, 0x3c2);
}
Beispiel #4
0
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    char *end;

    /* get code point */
    specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* is this a complex mapping? */
    if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
        /* there is some condition text in the fifth field */
        specialCasings[specialCasingCount].isComplex=TRUE;

        /* do not store any actual mappings for this */
        specialCasings[specialCasingCount].lowerCase[0]=0;
        specialCasings[specialCasingCount].upperCase[0]=0;
        specialCasings[specialCasingCount].titleCase[0]=0;
    } else {
        /* just set the "complex" flag and get the case mappings */
        specialCasings[specialCasingCount].isComplex=FALSE;
        specialCasings[specialCasingCount].lowerCase[0]=
            (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].upperCase[0]=
            (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].titleCase[0]=
            (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
    }

    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
Beispiel #5
0
U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
                  USet *unsafe,
                  UErrorCode *status)
{
    UChar buffer[internalBufferSize];
    int32_t len = 0;

    uset_clear(unsafe);

    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

    // add chars that fail the fcd check
    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

    // add Thai/Lao prevowels
    uset_addRange(unsafe, 0xe40, 0xe44);
    uset_addRange(unsafe, 0xec0, 0xec4);
    // add lead/trail surrogates
    uset_addRange(unsafe, 0xd800, 0xdfff);

    USet *contractions = uset_open(0,0);

    int32_t i = 0, j = 0;
    int32_t contsSize = ucol_getContractions(coll, contractions, status);
    UChar32 c = 0;
    // Contraction set consists only of strings
    // to get unsafe code points, we need to
    // break the strings apart and add them to the unsafe set
    for(i = 0; i < contsSize; i++) {
        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
        if(len > 0) {
            j = 0;
            while(j < len) {
                U16_NEXT(buffer, j, len, c);
                if(j < len) {
                    uset_add(unsafe, c);
                }
            }
        }
    }

    uset_close(contractions);

    return uset_size(unsafe);
}
Beispiel #6
0
U_CDECL_BEGIN
static UBool U_CALLCONV
_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
{
    UErrorCode *status = ((contContext *)context)->status;
    USet *expansions = ((contContext *)context)->expansions;
    USet *removed = ((contContext *)context)->removedContractions;
    UBool addPrefixes = ((contContext *)context)->addPrefixes;
    UChar contraction[internalBufferSize];
    if(isSpecial(CE)) {
      if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
        while(start < limit && U_SUCCESS(*status)) {
            // if there are suppressed contractions, we don't
            // want to add them.
            if(removed && uset_contains(removed, start)) {
                start++;
                continue;
            }
            // we start our contraction from middle, since we don't know if it
            // will grow toward right or left
            contraction[internalBufferSize/2] = (UChar)start;
            addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
            start++;
        }
      } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
        while(start < limit && U_SUCCESS(*status)) {
          uset_add(expansions, start++);
        }
      }
    }
    if(U_FAILURE(*status)) {
        return FALSE;
    } else {
        return TRUE;
    }
}
Beispiel #7
0
static void addAllCodePoints(USet* smartSet, const String& string)
{
    for (size_t i = 0; i < string.length(); i++)
        uset_add(smartSet, string[i]);
}
static void addAllCodePoints(USet* smartSet, const String& string)
{
    const UChar* characters = string.characters();
    for (size_t i = 0; i < string.length(); i++)
        uset_add(smartSet, characters[i]);
}
Beispiel #9
0
/**
 * Basic API test for uset.x
 */
static void TestAPI() {
    USet* set;
    USet* set2;
    UErrorCode ec;
    
    /* [] */
    set = uset_openEmpty();
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 0);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 1);
    uset_clear(set);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    /* [ABC] */
    set = uset_open(0x0041, 0x0043);
    expect(set, "ABC", "DEF{ab}", NULL);
    uset_close(set);

    /* [a-c{ab}] */
    ec = U_ZERO_ERROR;
    set = uset_openPattern(PAT, PAT_LEN, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
        return;
    }
    if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
        log_err("uset_resemblesPattern of PAT failed\n");
    }
    expect(set, "abc{ab}", "def{bc}", &ec);

    /* [a-d{ab}] */
    uset_add(set, 0x64);
    expect(set, "abcd{ab}", "ef{bc}", NULL);

    /* [acd{ab}{bc}] */
    uset_remove(set, 0x62);
    uset_addString(set, STR_bc, STR_bc_LEN);
    expect(set, "acd{ab}{bc}", "bef{cd}", NULL);

    /* [acd{bc}] */
    uset_removeString(set, STR_ab, STR_ab_LEN);
    expect(set, "acd{bc}", "bfg{ab}", NULL);

    /* [^acd{bc}] */
    uset_complement(set);
    expect(set, "bef{bc}", "acd{ac}", NULL);

    /* [a-e{bc}] */
    uset_complement(set);
    uset_addRange(set, 0x0062, 0x0065);
    expect(set, "abcde{bc}", "fg{ab}", NULL);

    /* [de{bc}] */
    uset_removeRange(set, 0x0050, 0x0063);
    expect(set, "de{bc}", "bcfg{ab}", NULL);

    /* [g-l] */
    uset_set(set, 0x0067, 0x006C);
    expect(set, "ghijkl", "de{bc}", NULL);

    if (uset_indexOf(set, 0x0067) != 0) {
        log_err("uset_indexOf failed finding correct index of 'g'\n");
    }

    if (uset_charAt(set, 0) != 0x0067) {
        log_err("uset_charAt failed finding correct char 'g' at index 0\n");
    }

    /* How to test this one...? */
    uset_compact(set);

    /* [g-i] */
    uset_retain(set, 0x0067, 0x0069);
    expect(set, "ghi", "dejkl{bc}", NULL);

    /* UCHAR_ASCII_HEX_DIGIT */
    uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
        return;
    }
    expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);

    /* [ab] */
    uset_clear(set);
    uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
    expect(set, "ab", "def{ab}", NULL);
    if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
        log_err("set should not conatin all characters of \"bc\" \n");
    }

    /* [] */
    set2 = uset_open(1, 1);
    uset_clear(set2);

    /* space */
    uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
    expect(set2, " ", "abcdefghi{bc}", NULL);

    /* [a-c] */
    uset_set(set2, 0x0061, 0x0063);
    /* [g-i] */
    uset_set(set, 0x0067, 0x0069);

    /* [a-c g-i] */
    if (uset_containsSome(set, set2)) {
        log_err("set should not contain some of set2 yet\n");
    }
    uset_complementAll(set, set2);
    if (!uset_containsSome(set, set2)) {
        log_err("set should contain some of set2\n");
    }
    expect(set, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_removeAll(set, set2);
    expect(set, "ghi", "abcdef{bc}", NULL);

    /* [a-c g-i] */
    uset_addAll(set2, set);
    expect(set2, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_retainAll(set2, set);
    expect(set2, "ghi", "abcdef{bc}", NULL);

    uset_close(set);
    uset_close(set2);
}
Beispiel #10
0
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    static UChar32 prevCode=0;
    UChar32 value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.gc=(uint8_t)i;
    } else {
        fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
            fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get canonical combining class, field 3 */
    value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
        fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    p.cc=(uint8_t)value;

    /* get uppercase mapping, field 12 */
    value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    if(end!=fields[12][1]) {
        fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.upperCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get lowercase value, field 13 */
    value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    if(end!=fields[13][1]) {
        fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.lowerCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get titlecase value, field 14 */
    value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    if(end!=fields[14][1]) {
        fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
            (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.titleCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* set additional properties from previously parsed files */
    if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
        p.specialCasing=specialCasings+specialCasingIndex++;
    } else {
        p.specialCasing=NULL;
    }
    if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
        p.caseFolding=caseFoldings+caseFoldingIndex++;

        /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
        if( p.caseFolding->status=='C' &&
            p.caseFolding->simple==p.lowerCase
        ) {
            p.caseFolding=NULL;
        }
    } else {
        p.caseFolding=NULL;
    }

    /* check for non-character code points */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* properties for a single code point */
    setProps(&p);

    prevCode=p.code;
}
Beispiel #11
0
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}