Ejemplo n.º 1
0
static void
makeUnfoldData() {
    static const UChar
        iDot[2]=        { 0x69, 0x307 };

    UChar *p, *q;
    int32_t i, j, k;
    UErrorCode errorCode;

    /*
     * add a case folding that we missed because it's conditional:
     * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
     */
    addUnfolding(0x130, iDot, 2);

    /* sort the data */
    errorCode=U_ZERO_ERROR;
    uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
                   compareUnfold, NULL, FALSE, &errorCode);

    /* make unique-string rows by merging adjacent ones' code point columns */

    /* make p point to row i-1 */
    p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;

    for(i=1; i<unfoldRows;) {
        if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
            /* concatenate code point columns */
            q=p+UGENCASE_UNFOLD_STRING_WIDTH;
            for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
            for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
                q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
            }
            if(j>UGENCASE_UNFOLD_CP_WIDTH) {
                fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
                        (long)j, UGENCASE_UNFOLD_CP_WIDTH);
                exit(U_BUFFER_OVERFLOW_ERROR);
            }

            /* move following rows up one */
            --unfoldRows;
            unfoldTop-=UGENCASE_UNFOLD_WIDTH;
            u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
        } else {
            p+=UGENCASE_UNFOLD_WIDTH;
            ++i;
        }
    }

    unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;

    if(beVerbose) {
        puts("unfold data:");

        p=(UChar *)unfold;
        for(i=0; i<unfoldRows; ++i) {
            p+=UGENCASE_UNFOLD_WIDTH;
            printf("[%2d] %04x %04x %04x <- %04x %04x\n",
                   (int)i, p[0], p[1], p[2], p[3], p[4]);
        }
    }
}
Ejemplo n.º 2
0
void
CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    /* default: map to self */
    int32_t delta=0;

    uint32_t type;
    if(props.binProps[UCHAR_LOWERCASE]) {
        type=UCASE_LOWER;
    } else if(props.binProps[UCHAR_UPPERCASE]) {
        type=UCASE_UPPER;
    } else if(props.getIntProp(UCHAR_GENERAL_CATEGORY)==U_TITLECASE_LETTER) {
        type=UCASE_TITLE;
    } else {
        type=UCASE_NONE;
    }
    uint32_t value=type;

    UBool hasMapping=FALSE;
    if(props.suc>=0) {
        /* uppercase mapping as delta if the character is lowercase */
        hasMapping=TRUE;
        if(type==UCASE_LOWER) {
            delta=props.suc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.slc>=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        hasMapping=TRUE;
        if(type>=UCASE_UPPER) {
            delta=props.slc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.stc>=0) {
        hasMapping=TRUE;
    }
    if(props.suc!=props.stc) {
        value|=UCASE_EXCEPTION;
    }
    if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
        newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }
    if( (props.scf>=0 && props.scf!=props.slc) ||
        (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
        newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    // Simple case folding falls back to simple lowercasing.
    // If there is no case folding but there is a lowercase mapping,
    // then add a case folding mapping to the code point.
    // For example: Cherokee uppercase syllables since Unicode 8.
    // (Full case folding falls back to simple case folding,
    // not to full lowercasing, so we need not also handle it specially
    // for such cases.)
    UChar32 scf=props.scf;
    if(scf<0 && props.slc>=0) {
        scf=start;
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(props.binProps[UCHAR_SOFT_DOTTED]) {
        value|=UCASE_SOFT_DOTTED;
    }
    int32_t cc=props.getIntProp(UCHAR_CANONICAL_COMBINING_CLASS);
    if(cc!=0) {
        if(props.binProps[UCHAR_SOFT_DOTTED]) {
            fprintf(stderr, "genprops error: a soft-dotted character has ccc!=0\n");
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if(cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    if(props.binProps[UCHAR_CASE_IGNORABLE]) {
        value|=UCASE_IGNORABLE;
    }

    if((hasMapping || (value&UCASE_EXCEPTION)) && start!=end) {
        fprintf(stderr,
                "genprops error: range %04lX..%04lX has case mappings "
                "or reasons for data structure exceptions\n",
                (long)start, (long)end);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        if(excPropsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return;
        }
        ExcProps *newExcProps=new ExcProps(props);
        if(newExcProps==NULL) {
            fprintf(stderr,
                    "genprops error: casepropsbuilder out of memory allocating "
                    "exceptions properties\n");
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newExcProps->props.scf=scf;
        newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
        newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
        value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
        excProps[excPropsCount++]=newExcProps;
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to set case mapping values: %s\n",
                u_errorName(errorCode));
        return;
    }

    if(hasMapping) {
        /* update the case-sensitive set */
        caseSensitive.add(start);
        if(scf>=0) { caseSensitive.add(scf); }
        if(props.slc>=0) { caseSensitive.add(props.slc); }
        if(props.suc>=0) { caseSensitive.add(props.suc); }
        if(props.stc>=0) { caseSensitive.add(props.stc); }
        caseSensitive.addAll(props.cf);
        caseSensitive.addAll(props.lc);
        caseSensitive.addAll(props.uc);
        caseSensitive.addAll(props.tc);

        /* update maxFullLength */
        if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); }
        if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); }
        if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); }
        if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) {
        addUnfolding(start, props.cf, errorCode);
    }
}
Ejemplo n.º 3
0
/*
 * Compute the case properties word for the code point p->code and store it
 * in column 0 of the properties vectors pv.
 *
 * Starts from the value already stored for the code point and adds
 * the titlecase type, the accent bits and either a simple mapping delta or,
 * when the data does not fit that form, the exception flag plus an index
 * into excProps[] where a copy of *p is kept for later processing.
 * A multi-code point full case folding is also added to the "unfold" data.
 *
 * Prints a message to stderr and exits the program on any error.
 */
extern void
setProps(Props *p) {
    UErrorCode errorCode;
    uint32_t bits, storedBits;
    int32_t simpleDelta;
    UBool caseIgnorable;

    /* start from the properties that did not come from UnicodeData.txt */
    storedBits=upvec_getValue(pv, p->code, 0);
    bits=storedBits;

    /* without a usable simple mapping the character maps to itself */
    simpleDelta=0;

    /* the Titlecase property is read late, from UnicodeData.txt */
    if(p->gc==U_TITLECASE_LETTER) {
        bits|=UCASE_TITLE;
    }

    /* an uppercase mapping is stored as the delta only for a lowercase character */
    if(p->upperCase!=0) {
        if((bits&UCASE_TYPE_MASK)!=UCASE_LOWER) {
            bits|=UCASE_EXCEPTION;
        } else {
            simpleDelta=p->upperCase-p->code;
        }
    }
    /* a lowercase mapping is stored as the delta only for an uppercase or titlecase character */
    if(p->lowerCase!=0) {
        if((bits&UCASE_TYPE_MASK)<UCASE_UPPER) {
            bits|=UCASE_EXCEPTION;
        } else {
            simpleDelta=p->lowerCase-p->code;
        }
    }

    /*
     * Each of these needs an exceptions entry:
     * a titlecase mapping different from the uppercase one, a case closure,
     * special casing data, case folding data, or a delta that is out of range.
     */
    if( p->upperCase!=p->titleCase ||
        p->closure[0]!=0 ||
        p->specialCasing!=NULL ||
        p->caseFolding!=NULL ||
        simpleDelta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<simpleDelta
    ) {
        bits|=UCASE_EXCEPTION;
    }

    /* combining class: above (230) vs. any other accent; not allowed together with a dot bit */
    if(p->cc!=0) {
        if(bits&UCASE_DOT_MASK) {
            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        bits|= (p->cc==230) ? UCASE_ABOVE : UCASE_OTHER_ACCENT;
    }

    /*
     * Determine case-ignorable, only for uncased characters.
     * All versions: general category Mn, Me, Cf, Lm or Sk.
     * Unicode 4.1 and up: (D47a) also Word_Break=MidLetter;
     * Unicode 5.1 and up: also Word_Break=MidNumLet
     *   (the UGENCASE_IS_MID_LETTER_SHIFT bit is set for both Word_Break values).
     * Before Unicode 4.1: also 0027, 00AD and 2019.
     */
    caseIgnorable=FALSE;
    if((bits&UCASE_TYPE_MASK)==UCASE_NONE) {
        if((U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0) {
            caseIgnorable=TRUE;
        } else if(ucdVersion>=UNI_4_1) {
            if((upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0) {
                caseIgnorable=TRUE;
            }
        } else {
            if(p->code==0x27 || p->code==0xad || p->code==0x2019) {
                caseIgnorable=TRUE;
            }
        }
    }

    /*
     * Case-ignorable is encoded as delta==1 on an uncased character.
     * This reuses the delta/exception bits, which is possible because
     * uncased characters have no delta of their own (see the checks above).
     * It cannot be combined with an exceptions entry, so an uncased,
     * case-ignorable character with exceptions is an error.
     * The one known such character is U+0307: it is uncased, Mn and has
     * conditional special casing; it is skipped here and handled in code instead.
     */
    if(caseIgnorable && p->code!=0x307) {
        if(bits&UCASE_EXCEPTION) {
            fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
                            (unsigned long)p->code);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }

        simpleDelta=1;
    }

    if(bits&UCASE_EXCEPTION) {
        /* keep a copy of the properties for later processing and put its index into the word */
        bits|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        uprv_memcpy(&excProps[exceptionsCount], p, sizeof(*p));
        ++exceptionsCount;
        if(exceptionsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }
    } else {
        /* no exception: the delta goes directly into the word */
        bits|=((uint32_t)simpleDelta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    /* write the word back only if it changed */
    if(bits!=storedBits) {
        errorCode=U_ZERO_ERROR;
        upvec_setValue(pv, p->code, p->code, 0, bits, 0xffffffff, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                            u_errorName(errorCode));
            exit(errorCode);
        }
    }

    /*
     * Add a multi-character full case folding to the "unfold" data.
     * full[0] holds the length and the string starts at full+1.
     */
    if(p->caseFolding!=NULL) {
        int32_t foldingLength=p->caseFolding->full[0];
        if(foldingLength>1) {
            if(u_strHasMoreChar32Than(p->caseFolding->full+1, foldingLength, 1)) {
                addUnfolding(p->code, p->caseFolding->full+1, foldingLength);
            }
        }
    }
}