static UBool getSystemTimeInformation(TimeZone *tz, SYSTEMTIME &daylightDate, SYSTEMTIME &standardDate, int32_t &bias, int32_t &daylightBias, int32_t &standardBias) {
    UErrorCode status = U_ZERO_ERROR;
    UBool result = TRUE;
    BasicTimeZone *btz = (BasicTimeZone*)tz; // we should check type
    InitialTimeZoneRule *initial = NULL;
    AnnualTimeZoneRule *std = NULL, *dst = NULL;

    btz->getSimpleRulesNear(uprv_getUTCtime(), initial, std, dst, status);
    if (U_SUCCESS(status)) {
        if (std == NULL || dst == NULL) {
            bias = -1 * (initial->getRawOffset()/60000);
            daylightBias = 0;
            // Do not use DST.  Set 0 to all stadardDate/daylightDate fields
            standardDate.wYear = standardDate.wMonth  = standardDate.wDayOfWeek = standardDate.wDay = 
            standardDate.wHour = standardDate.wMinute = standardDate.wSecond    = standardDate.wMilliseconds = 0;
            daylightDate.wYear = daylightDate.wMonth  = daylightDate.wDayOfWeek = daylightDate.wDay =
            daylightDate.wHour = daylightDate.wMinute = daylightDate.wSecond    = daylightDate.wMilliseconds = 0;
        } else {
            U_ASSERT(std->getRule()->getDateRuleType() == DateTimeRule::DOW);
            U_ASSERT(dst->getRule()->getDateRuleType() == DateTimeRule::DOW);

            bias = -1 * (std->getRawOffset()/60000);
            daylightBias = -1 * (dst->getDSTSavings()/60000);
            // Always use DOW type rule
            int32_t hour, min, sec, mil;
            standardDate.wYear = 0;
            standardDate.wMonth = std->getRule()->getRuleMonth() + 1;
            standardDate.wDay = std->getRule()->getRuleWeekInMonth();
            if (standardDate.wDay < 0) {
                standardDate.wDay = 5;
            }
            standardDate.wDayOfWeek = std->getRule()->getRuleDayOfWeek() - 1;

            mil = std->getRule()->getRuleMillisInDay();
            hour = mil/3600000;
            mil %= 3600000;
            min = mil/60000;
            mil %= 60000;
            sec = mil/1000;
            mil %= 1000;

            standardDate.wHour = hour;
            standardDate.wMinute = min;
            standardDate.wSecond = sec;
            standardDate.wMilliseconds = mil;

            daylightDate.wYear = 0;
            daylightDate.wMonth = dst->getRule()->getRuleMonth() + 1;
            daylightDate.wDay = dst->getRule()->getRuleWeekInMonth();
            if (daylightDate.wDay < 0) {
                daylightDate.wDay = 5;
            }
            daylightDate.wDayOfWeek = dst->getRule()->getRuleDayOfWeek() - 1;

            mil = dst->getRule()->getRuleMillisInDay();
            hour = mil/3600000;
            mil %= 3600000;
            min = mil/60000;
            mil %= 60000;
            sec = mil/1000;
            mil %= 1000;

            daylightDate.wHour = hour;
            daylightDate.wMinute = min;
            daylightDate.wSecond = sec;
            daylightDate.wMilliseconds = mil;
        }
    } else {
        result = FALSE;
    }

    delete initial;
    delete std;
    delete dst;

    return result;
}
Beispiel #2
0
UnicodeString&
TZGNCore::formatGenericNonLocationName(const TimeZone& tz, UTimeZoneGenericNameType type, UDate date, UnicodeString& name) const {
    U_ASSERT(type == UTZGNM_LONG || type == UTZGNM_SHORT);
    name.setToBogus();

    const UChar* uID = ZoneMeta::getCanonicalCLDRID(tz);
    if (uID == NULL) {
        return name;
    }

    UnicodeString tzID(TRUE, uID, -1);

    // Try to get a name from time zone first
    UTimeZoneNameType nameType = (type == UTZGNM_LONG) ? UTZNM_LONG_GENERIC : UTZNM_SHORT_GENERIC;
    fTimeZoneNames->getTimeZoneDisplayName(tzID, nameType, name);

    if (!name.isEmpty()) {
        return name;
    }

    // Try meta zone
    UChar mzIDBuf[32];
    UnicodeString mzID(mzIDBuf, 0, UPRV_LENGTHOF(mzIDBuf));
    fTimeZoneNames->getMetaZoneID(tzID, date, mzID);
    if (!mzID.isEmpty()) {
        UErrorCode status = U_ZERO_ERROR;
        UBool useStandard = FALSE;
        int32_t raw, sav;
        UChar tmpNameBuf[64];

        tz.getOffset(date, FALSE, raw, sav, status);
        if (U_FAILURE(status)) {
            return name;
        }

        if (sav == 0) {
            useStandard = TRUE;

            TimeZone *tmptz = tz.clone();
            // Check if the zone actually uses daylight saving time around the time
            BasicTimeZone *btz = NULL;
            if (dynamic_cast<OlsonTimeZone *>(tmptz) != NULL
                || dynamic_cast<SimpleTimeZone *>(tmptz) != NULL
                || dynamic_cast<RuleBasedTimeZone *>(tmptz) != NULL
                || dynamic_cast<VTimeZone *>(tmptz) != NULL) {
                btz = (BasicTimeZone*)tmptz;
            }

            if (btz != NULL) {
                TimeZoneTransition before;
                UBool beforTrs = btz->getPreviousTransition(date, TRUE, before);
                if (beforTrs
                        && (date - before.getTime() < kDstCheckRange)
                        && before.getFrom()->getDSTSavings() != 0) {
                    useStandard = FALSE;
                } else {
                    TimeZoneTransition after;
                    UBool afterTrs = btz->getNextTransition(date, FALSE, after);
                    if (afterTrs
                            && (after.getTime() - date < kDstCheckRange)
                            && after.getTo()->getDSTSavings() != 0) {
                        useStandard = FALSE;
                    }
                }
            } else {
                // If not BasicTimeZone... only if the instance is not an ICU's implementation.
                // We may get a wrong answer in edge case, but it should practically work OK.
                tmptz->getOffset(date - kDstCheckRange, FALSE, raw, sav, status);
                if (sav != 0) {
                    useStandard = FALSE;
                } else {
                    tmptz->getOffset(date + kDstCheckRange, FALSE, raw, sav, status);
                    if (sav != 0){
                        useStandard = FALSE;
                    }
                }
                if (U_FAILURE(status)) {
                    delete tmptz;
                    return name;
                }
            }
            delete tmptz;
        }
        if (useStandard) {
            UTimeZoneNameType stdNameType = (nameType == UTZNM_LONG_GENERIC)
                ? UTZNM_LONG_STANDARD : UTZNM_SHORT_STANDARD;
            UnicodeString stdName(tmpNameBuf, 0, UPRV_LENGTHOF(tmpNameBuf));
            fTimeZoneNames->getDisplayName(tzID, stdNameType, date, stdName);
            if (!stdName.isEmpty()) {
                name.setTo(stdName);

                // TODO: revisit this issue later
                // In CLDR, a same display name is used for both generic and standard
                // for some meta zones in some locales.  This looks like a data bugs.
                // For now, we check if the standard name is different from its generic
                // name below.
                UChar genNameBuf[64];
                UnicodeString mzGenericName(genNameBuf, 0, UPRV_LENGTHOF(genNameBuf));
                fTimeZoneNames->getMetaZoneDisplayName(mzID, nameType, mzGenericName);
                if (stdName.caseCompare(mzGenericName, 0) == 0) {
                    name.setToBogus();
                }
            }
        }
        if (name.isEmpty()) {
            // Get a name from meta zone
            UnicodeString mzName(tmpNameBuf, 0, UPRV_LENGTHOF(tmpNameBuf));
            fTimeZoneNames->getMetaZoneDisplayName(mzID, nameType, mzName);
            if (!mzName.isEmpty()) {
                // Check if we need to use a partial location format.
                // This check is done by comparing offset with the meta zone's
                // golden zone at the given date.
                UChar idBuf[32];
                UnicodeString goldenID(idBuf, 0, UPRV_LENGTHOF(idBuf));
                fTimeZoneNames->getReferenceZoneID(mzID, fTargetRegion, goldenID);
                if (!goldenID.isEmpty() && goldenID != tzID) {
                    TimeZone *goldenZone = TimeZone::createTimeZone(goldenID);
                    int32_t raw1, sav1;

                    // Check offset in the golden zone with wall time.
                    // With getOffset(date, false, offsets1),
                    // you may get incorrect results because of time overlap at DST->STD
                    // transition.
                    goldenZone->getOffset(date + raw + sav, TRUE, raw1, sav1, status);
                    delete goldenZone;
                    if (U_SUCCESS(status)) {
                        if (raw != raw1 || sav != sav1) {
                            // Now we need to use a partial location format
                            getPartialLocationName(tzID, mzID, (nameType == UTZNM_LONG_GENERIC), mzName, name);
                        } else {
                            name.setTo(mzName);
                        }
                    }
                } else {
                    name.setTo(mzName);
                }
            }
        }
    }
    return name;
}
static int32_t
_internal_toUnicode(const UChar* src, int32_t srcLength,
                    UChar* dest, int32_t destCapacity,
                    int32_t options,
                    UStringPrepProfile* nameprep,
                    UParseError* parseError,
                    UErrorCode* status)
{

    //get the options
    //UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);
    int32_t namePrepOptions = ((options & UIDNA_ALLOW_UNASSIGNED) != 0) ? USPREP_ALLOW_UNASSIGNED: 0; 

    // TODO Revisit buffer handling. The label should not be over 63 ASCII characters. ICU4J may need to be updated too.
    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];

    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack;
    int32_t b1Len, b2Len, b1PrimeLen, b3Len,
            b1Capacity = MAX_LABEL_BUFFER_SIZE, 
            b2Capacity = MAX_LABEL_BUFFER_SIZE,
            b3Capacity = MAX_LABEL_BUFFER_SIZE,
            reqLength=0;

    b1Len = 0;
    UBool* caseFlags = NULL;

    UBool srcIsASCII = TRUE;
    /*UBool srcIsLDH = TRUE;
    int32_t failPos =0;*/

    // step 1: find out if all the codepoints in src are ASCII  
    if(srcLength==-1){
        srcLength = 0;
        for(;src[srcLength]!=0;){
            if(src[srcLength]> 0x7f){
                srcIsASCII = FALSE;
            }/*else if(isLDHChar(src[srcLength])==FALSE){
                // here we do not assemble surrogates
                // since we know that LDH code points
                // are in the ASCII range only
                srcIsLDH = FALSE;
                failPos = srcLength;
            }*/
            srcLength++;
        }
    }else if(srcLength > 0){
        for(int32_t j=0; j<srcLength; j++){
            if(src[j]> 0x7f){
                srcIsASCII = FALSE;
            }/*else if(isLDHChar(src[j])==FALSE){
                // here we do not assemble surrogates
                // since we know that LDH code points
                // are in the ASCII range only
                srcIsLDH = FALSE;
                failPos = j;
            }*/
        }
    }else{
        return 0;
    }
    
    if(srcIsASCII == FALSE){
        // step 2: process the string
        b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Capacity, namePrepOptions, parseError, status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error
            
            b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Len, namePrepOptions, parseError, status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
    }else{

        //just point src to b1
        b1 = (UChar*) src;
        b1Len = srcLength;
    }

    // The RFC states that 
    // <quote>
    // ToUnicode never fails. If any step fails, then the original input
    // is returned immediately in that step.
    // </quote>

    //step 3: verify ACE Prefix
    if(startsWithPrefix(b1,b1Len)){

        //step 4: Remove the ACE Prefix
        b1Prime = b1 + ACE_PREFIX_LENGTH;
        b1PrimeLen  = b1Len - ACE_PREFIX_LENGTH;

        //step 5: Decode using punycode
        b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b2Len =  u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status);
        }


        //step 6:Apply toASCII
        b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity, options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR);
            if(b3==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b3Len =  uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status);

        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 7: verify
        if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){
            // Cause the original to be returned.
            *status = U_IDNA_VERIFICATION_ERROR;
            goto CLEANUP;
        }

        //step 8: return output of step 5
        reqLength = b2Len;
        if(b2Len <= destCapacity) {
            uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR);
        }
    }
    else{
        // See the start of this if statement for why this is commented out.
        // verify that STD3 ASCII rules are satisfied
        /*if(useSTD3ASCIIRules == TRUE){
            if( srcIsLDH == FALSE // source contains some non-LDH characters
                || src[0] ==  HYPHEN || src[srcLength-1] == HYPHEN){
                *status = U_IDNA_STD3_ASCII_RULES_ERROR;

                // populate the parseError struct
                if(srcIsLDH==FALSE){
                    // failPos is always set the index of failure
                    uprv_syntaxError(src,failPos, srcLength,parseError);
                }else if(src[0] == HYPHEN){
                    // fail position is 0 
                    uprv_syntaxError(src,0,srcLength,parseError);
                }else{
                    // the last index in the source is always length-1
                    uprv_syntaxError(src, (srcLength>0) ? srcLength-1 : srcLength, srcLength,parseError);
                }

                goto CLEANUP;
            }
        }*/
        // just return the source
        //copy the source to destination
        if(srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }


CLEANUP:

    if(b1 != b1Stack && b1!=src){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    uprv_free(caseFlags);

    // The RFC states that 
    // <quote>
    // ToUnicode never fails. If any step fails, then the original input
    // is returned immediately in that step.
    // </quote>
    // So if any step fails lets copy source to destination
    if(U_FAILURE(*status)){
        //copy the source to destination
        if(dest && srcLength <= destCapacity){
            // srcLength should have already been set earlier.
            U_ASSERT(srcLength >= 0);
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
        *status = U_ZERO_ERROR;
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
Beispiel #4
0
void uBuildMemory(uType* type)
{
    U_ASSERT(type);
    if (!type->IsClosed())
        return;

    size_t strongCount = 0,
           weakCount = 0,
           objOffset = U_IS_OBJECT(type)
               ? sizeof(uObject)
               : 0,
           typeOffset = 0;

    if (type->Base)
        type->Base->Build();

    for (size_t i = 0; i < type->FieldCount; i++)
    {
        uFieldInfo& f = type->Fields[i];
        U_ASSERT(f.Type);

        if (f.Type != type && !U_IS_OBJECT(f.Type))
            f.Type->Build();

        if ((f.Flags & uFieldFlagsStatic) == 0)
        {
            if ((f.Flags & uFieldFlagsConstrained) == 0)
                objOffset = f.Offset + f.Type->ValueSize;

            if (U_IS_VALUE(f.Type))
            {
                strongCount += f.Type->Refs.StrongCount;
                weakCount += f.Type->Refs.WeakCount;
            }
            else if ((f.Flags & uFieldFlagsWeak) != 0)
                weakCount++;
            else
                strongCount++;
        }
        else if ((f.Flags & uFieldFlagsConstrained) != 0)
        {
            uAlignField(typeOffset, f.Type);
            f.Offset = typeOffset;
            typeOffset += f.Type->ValueSize;
        }
    }

    size_t size = typeOffset + (strongCount + weakCount) * sizeof(uRefInfo<size_t>);
    uint8_t* ptr = (uint8_t*)malloc(size); // Leak
    memset(ptr, 0, size);

    type->Refs.Strong = (uRefInfo<size_t>*)ptr;
    ptr += strongCount * sizeof(uRefInfo<size_t>);

    type->Refs.Weak = (uRefInfo<size_t>*)ptr;
    ptr += weakCount * sizeof(uRefInfo<size_t>);

    for (size_t i = 0; i < type->FieldCount; i++)
    {
#ifdef DEBUG_ARC
#define DEBUG_NAME ((Xli::String)type->FullName + "[" + (int)i + "]").CopyPtr(), // Leak
#else
#define DEBUG_NAME
#endif
        uFieldInfo& f = type->Fields[i];

        if ((f.Flags & uFieldFlagsStatic) == 0)
        {
            if ((f.Flags & uFieldFlagsConstrained) != 0)
            {
                uAlignField(objOffset, f.Type);
                f.Flags &= ~uFieldFlagsConstrained;
                f.Offset = objOffset;
                objOffset += f.Type->ValueSize;
            }

            if (U_IS_VALUE(f.Type))
            {
                f.Flags &= ~uFieldFlagsWeak;

                for (size_t j = 0; j < f.Type->Refs.StrongCount; j++)
                    type->Refs.Strong[type->Refs.StrongCount++] = f.Type->Refs.Strong[j] + f.Offset;
                for (size_t j = 0; j < f.Type->Refs.WeakCount; j++)
                    type->Refs.Weak[type->Refs.WeakCount++] = f.Type->Refs.Weak[j] + f.Offset;
            }
            else if ((f.Flags & uFieldFlagsWeak) != 0)
            {
                uRefInfo<size_t> ref = {DEBUG_NAME f.Offset};
                type->Refs.Weak[type->Refs.WeakCount++] = ref;
            }
            else
            {
                uRefInfo<size_t> ref = {DEBUG_NAME f.Offset};
                type->Refs.Strong[type->Refs.StrongCount++] = ref;
            }
        }
        else
        {
            if ((f.Flags & uFieldFlagsConstrained) != 0)
            {
                f.Flags &= ~uFieldFlagsConstrained;
                f.Offset += (uintptr_t)ptr;
            }

            if ((f.Flags & uFieldFlagsWeak) != 0)
            {
                uRefInfo<uWeakObject**> ref = {DEBUG_NAME (uWeakObject**)f.Offset};
                _WeakRefs->Add(ref);
            }
            else if (U_IS_OBJECT(f.Type))
            {
                uRefInfo<uObject**> ref = {DEBUG_NAME (uObject**)f.Offset};
                _StrongRefs->Add(ref);
            }
        }
#undef DEBUG_NAME
    }

    if (U_IS_VALUE(type))
    {
        if (objOffset != 0)
        {
            uAlignField(objOffset, type);
            U_ASSERT(type->ValueSize == objOffset || type->ValueSize == 0);
            type->ValueSize = objOffset;
        }

        type->ObjectSize = sizeof(uObject) + type->ValueSize;
    }
    else
    {
        if (type->Base && type->Base->ObjectSize > objOffset)
            objOffset = type->Base->ObjectSize;

        if (objOffset > type->ObjectSize)
            type->ObjectSize = objOffset;
    }

#ifdef DEBUG_UNSAFE
    uint8_t* layout = (uint8_t*)U_ALLOCA(type->ObjectSize);
    memset(layout, 0, type->ObjectSize);

    for (size_t i = 0; i < type->FieldCount; i++)
    {
        uFieldInfo& f = type->Fields[i];
        if ((f.Flags & uFieldFlagsStatic) == 0)
        {
            for (size_t j = 0; j < f.Type->ValueSize; j++)
            {
                U_ASSERT(f.Offset + j < type->ObjectSize);
                layout[f.Offset + j]++;
            }
        }
    }

    // Verify that no fields are overlapping
    for (size_t i = 0; i < type->ObjectSize; i++)
        U_ASSERT(layout[i] < 2);
#endif
}
/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t 
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in 
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary 
        // starting with this charcter, treat character as a 1-character word 
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in 
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
Beispiel #6
0
U_CAPI void U_EXPORT2
umtx_condSignal(UConditionVar *cond) {
    int sysErr = pthread_cond_signal(&cond->fCondition);
    (void)sysErr;
    U_ASSERT(sysErr == 0);
}
Beispiel #7
0
static int32_t U_CALLCONV
ubidi_swap(const UDataSwapper *ds,
           const void *inData, int32_t length, void *outData,
           UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    const int32_t *inIndexes;
    int32_t indexes[16];

    int32_t i, offset, count, size;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==UBIDI_FMT_0 &&    /* dataFormat="BiDi" */
        pInfo->dataFormat[1]==UBIDI_FMT_1 &&
        pInfo->dataFormat[2]==UBIDI_FMT_2 &&
        pInfo->dataFormat[3]==UBIDI_FMT_3 &&
        ((pInfo->formatVersion[0]==1 &&
          pInfo->formatVersion[2]==UTRIE_SHIFT &&
          pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) ||
         pInfo->formatVersion[0]==2)
    )) {
        udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    outBytes=(uint8_t *)outData+headerSize;

    inIndexes=(const int32_t *)inBytes;

    if(length>=0) {
        length-=headerSize;
        if(length<16*4) {
            udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for bidi/shaping data\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    /* read the first 16 indexes (ICU 3.4/format version 1: UBIDI_IX_TOP==16, might grow) */
    for(i=0; i<16; ++i) {
        indexes[i]=udata_readInt32(ds, inIndexes[i]);
    }

    /* get the total length of the data */
    size=indexes[UBIDI_IX_LENGTH];

    if(length>=0) {
        if(length<size) {
            udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for all of bidi/shaping data\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* copy the data for inaccessible bytes */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        offset=0;

        /* swap the int32_t indexes[] */
        count=indexes[UBIDI_IX_INDEX_TOP]*4;
        ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
        offset+=count;

        /* swap the UTrie */
        count=indexes[UBIDI_IX_TRIE_SIZE];
        utrie2_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
        offset+=count;

        /* swap the uint32_t mirrors[] */
        count=indexes[UBIDI_IX_MIRROR_LENGTH]*4;
        ds->swapArray32(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
        offset+=count;

        /* just skip the uint8_t jgArray[] */
        count=indexes[UBIDI_IX_JG_LIMIT]-indexes[UBIDI_IX_JG_START];
        offset+=count;

        U_ASSERT(offset==size);
    }

    return headerSize+size;
}
Beispiel #8
0
/**
* Override Calendar to compute several fields specific to the Islamic
* calendar system.  These are:
*
* <ul><li>ERA
* <li>YEAR
* <li>MONTH
* <li>DAY_OF_MONTH
* <li>DAY_OF_YEAR
* <li>EXTENDED_YEAR</ul>
* 
* The DAY_OF_WEEK and DOW_LOCAL fields are already set when this
* method is called. The getGregorianXxx() methods return Gregorian
* calendar equivalents for the given Julian day.
* @draft ICU 2.4
*/
void IslamicCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) {
    int32_t year, month, dayOfMonth, dayOfYear;
    int32_t startDate;
    int32_t days = julianDay - CIVIL_EPOC;

    if (cType == CIVIL || cType == TBLA) {
        if(cType == TBLA) {
            days = julianDay - ASTRONOMICAL_EPOC;
        }
        // Use the civil calendar approximation, which is just arithmetic
        year  = (int)ClockMath::floorDivide( (double)(30 * days + 10646) , 10631.0 );
        month = (int32_t)uprv_ceil((days - 29 - yearStart(year)) / 29.5 );
        month = month<11?month:11;
        startDate = monthStart(year, month);
    } else if(cType == ASTRONOMICAL){
        // Guess at the number of elapsed full months since the epoch
        int32_t months = (int32_t)uprv_floor((double)days / CalendarAstronomer::SYNODIC_MONTH);

        startDate = (int32_t)uprv_floor(months * CalendarAstronomer::SYNODIC_MONTH);

        double age = moonAge(internalGetTime(), status);
        if (U_FAILURE(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        if ( days - startDate >= 25 && age > 0) {
            // If we're near the end of the month, assume next month and search backwards
            months++;
        }

        // Find out the last time that the new moon was actually visible at this longitude
        // This returns midnight the night that the moon was visible at sunset.
        while ((startDate = trueMonthStart(months)) > days) {
            // If it was after the date in question, back up a month and try again
            months--;
        }

        year = months / 12 + 1;
        month = months % 12;
    } else if(cType == UMALQURA) {
        int32_t umalquraStartdays = yearStart(UMALQURA_YEAR_START) ;
        if( days < umalquraStartdays){
                //Use Civil calculation
                year  = (int)ClockMath::floorDivide( (double)(30 * days + 10646) , 10631.0 );
                month = (int32_t)uprv_ceil((days - 29 - yearStart(year)) / 29.5 );
                month = month<11?month:11;
                startDate = monthStart(year, month);
            }else{
                int y =UMALQURA_YEAR_START-1, m =0;
                long d = 1;
                while(d > 0){ 
                    y++; 
                    d = days - yearStart(y) +1;
                    if(d == handleGetYearLength(y)){
                        m=11;
                        break;
                    }else if(d < handleGetYearLength(y) ){
                        int monthLen = handleGetMonthLength(y, m); 
                        m=0;
                        while(d > monthLen){
                            d -= monthLen;
                            m++;
                            monthLen = handleGetMonthLength(y, m);
                        }
                        break;
                    }
                }
                year = y;
                month = m;
            }
    } else { // invalid 'civil'
      U_ASSERT(false); // should not get here, out of range
      year=month=0;
    }

    dayOfMonth = (days - monthStart(year, month)) + 1;

    // Now figure out the day of the year.
    dayOfYear = (days - monthStart(year, 0)) + 1;


    internalSet(UCAL_ERA, 0);
    internalSet(UCAL_YEAR, year);
    internalSet(UCAL_EXTENDED_YEAR, year);
    internalSet(UCAL_MONTH, month);
    internalSet(UCAL_DAY_OF_MONTH, dayOfMonth);
    internalSet(UCAL_DAY_OF_YEAR, dayOfYear);       
}    
Beispiel #9
0
//------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             A new RBBI setref node referring to the set is pushed onto the node
//             stack.
//
//             The scan position is normally under the control of the state machine
//             that controls rule parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the RBBI rule parser.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
    UnicodeSet    *uset;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet();
    if (uset == NULL) {
        localStatus = U_MEMORY_ALLOCATION_ERROR;
    } else {
        uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
    }
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
        #ifdef RBBI_DEBUG
            RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        #endif
        error(localStatus);
        delete uset;
        return;
    }

    // Verify that the set contains at least one code point.
    //
    U_ASSERT(uset!=NULL);
    if (uset->isEmpty()) {
        // This set is empty.
        //  Make it an error, because it almost certainly is not what the user wanted.
        //  Also, avoids having to think about corner cases in the tree manipulation code
        //   that occurs later on.
        error(U_BRK_RULE_EMPTY_SET);
        delete uset;
        return;
    }


    // Advance the RBBI parse postion over the UnicodeSet pattern.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    if (U_SUCCESS(*fRB->fStatus)) {
        RBBINode         *n;

        n = pushNewNode(RBBINode::setRef);
        if (U_FAILURE(*fRB->fStatus)) {
            return;
        }
        n->fFirstPos = startPos;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        //  findSetFor() serves several purposes here:
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
        //     - Mantains collection of all sets in use, needed later for establishing
        //          character categories for run time engine.
        //     - Eliminates mulitiple instances of the same set.
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
        findSetFor(n->fText, n, uset);
    }

}
U_CAPI void U_EXPORT2
unum_setAttribute(    UNumberFormat*          fmt,
            UNumberFormatAttribute  attr,
            int32_t                 newValue)
{
  if (((NumberFormat*)fmt)->getDynamicClassID() == DecimalFormat::getStaticClassID()) {
    DecimalFormat* df = (DecimalFormat*) fmt;
    switch(attr) {
    case UNUM_PARSE_INT_ONLY:
        df->setParseIntegerOnly(newValue!=0);
        break;
        
    case UNUM_GROUPING_USED:
        df->setGroupingUsed(newValue!=0);
        break;
        
    case UNUM_DECIMAL_ALWAYS_SHOWN:
        df->setDecimalSeparatorAlwaysShown(newValue!=0);
        break;
        
    case UNUM_MAX_INTEGER_DIGITS:
        df->setMaximumIntegerDigits(newValue);
        break;
        
    case UNUM_MIN_INTEGER_DIGITS:
        df->setMinimumIntegerDigits(newValue);
        break;
        
    case UNUM_INTEGER_DIGITS:
        df->setMinimumIntegerDigits(newValue);
        df->setMaximumIntegerDigits(newValue);
        break;
        
    case UNUM_MAX_FRACTION_DIGITS:
        df->setMaximumFractionDigits(newValue);
        break;
        
    case UNUM_MIN_FRACTION_DIGITS:
        df->setMinimumFractionDigits(newValue);
        break;
        
    case UNUM_FRACTION_DIGITS:
        df->setMinimumFractionDigits(newValue);
        df->setMaximumFractionDigits(newValue);
        break;
        
    case UNUM_SIGNIFICANT_DIGITS_USED:
        df->setSignificantDigitsUsed(newValue!=0);
        break;

    case UNUM_MAX_SIGNIFICANT_DIGITS:
        df->setMaximumSignificantDigits(newValue);
        break;
        
    case UNUM_MIN_SIGNIFICANT_DIGITS:
        df->setMinimumSignificantDigits(newValue);
        break;
        
    case UNUM_MULTIPLIER:
        df->setMultiplier(newValue);    
        break;
        
    case UNUM_GROUPING_SIZE:
        df->setGroupingSize(newValue);    
        break;
        
    case UNUM_ROUNDING_MODE:
        df->setRoundingMode((DecimalFormat::ERoundingMode)newValue);
        break;
        
    case UNUM_FORMAT_WIDTH:
        df->setFormatWidth(newValue);
        break;
        
    case UNUM_PADDING_POSITION:
        /** The position at which padding will take place. */
        df->setPadPosition((DecimalFormat::EPadPosition)newValue);
        break;
        
    case UNUM_SECONDARY_GROUPING_SIZE:
        df->setSecondaryGroupingSize(newValue);
        break;
        
    default:
        /* Shouldn't get here anyway */
        break;
    }
  } else {
    U_ASSERT(((NumberFormat*)fmt)->getDynamicClassID() == RuleBasedNumberFormat::getStaticClassID());
    if (attr == UNUM_LENIENT_PARSE) {
#if !UCONFIG_NO_COLLATION
      ((RuleBasedNumberFormat*)fmt)->setLenient((UBool)newValue);
#endif
    }
  }
}
U_CAPI int32_t U_EXPORT2
unum_getTextAttribute(const UNumberFormat*  fmt,
            UNumberFormatTextAttribute      tag,
            UChar*                          result,
            int32_t                         resultLength,
            UErrorCode*                     status)
{
    if(U_FAILURE(*status))
        return -1;

    UnicodeString res;
    if(!(result==NULL && resultLength==0)) {
        // NULL destination for pure preflighting: empty dummy string
        // otherwise, alias the destination buffer
        res.setTo(result, 0, resultLength);
    }

    if (((const NumberFormat*)fmt)->getDynamicClassID() == DecimalFormat::getStaticClassID()) {
        const DecimalFormat* df = (const DecimalFormat*) fmt;
        switch(tag) {
        case UNUM_POSITIVE_PREFIX:
            df->getPositivePrefix(res);
            break;

        case UNUM_POSITIVE_SUFFIX:
            df->getPositiveSuffix(res);
            break;

        case UNUM_NEGATIVE_PREFIX:
            df->getNegativePrefix(res);
            break;

        case UNUM_NEGATIVE_SUFFIX:
            df->getNegativeSuffix(res);
            break;

        case UNUM_PADDING_CHARACTER:
            res = df->getPadCharacterString();
            break;

        case UNUM_CURRENCY_CODE:
            res = UnicodeString(df->getCurrency());
            break;

        default:
            *status = U_UNSUPPORTED_ERROR;
            return -1;
        }
    } else {
        U_ASSERT(((const NumberFormat*)fmt)->getDynamicClassID() == RuleBasedNumberFormat::getStaticClassID());
        const RuleBasedNumberFormat* rbnf = (const RuleBasedNumberFormat*)fmt;
        if (tag == UNUM_DEFAULT_RULESET) {
            res = rbnf->getDefaultRuleSetName();
        } else if (tag == UNUM_PUBLIC_RULESETS) {
            int32_t count = rbnf->getNumberOfRuleSetNames();
            for (int i = 0; i < count; ++i) {
                res += rbnf->getRuleSetName(i);
                res += (UChar)0x003b; // semicolon
            }
        } else {
            *status = U_UNSUPPORTED_ERROR;
            return -1;
        }
    }

    return res.extract(result, resultLength, *status);
}
U_CAPI int32_t U_EXPORT2
unum_getAttribute(const UNumberFormat*          fmt,
          UNumberFormatAttribute  attr)
{
  if (((const NumberFormat*)fmt)->getDynamicClassID() == DecimalFormat::getStaticClassID()) {
    const DecimalFormat* df = (const DecimalFormat*) fmt;
    switch(attr) {
    case UNUM_PARSE_INT_ONLY:
        return df->isParseIntegerOnly();
        
    case UNUM_GROUPING_USED:
        return df->isGroupingUsed();
        
    case UNUM_DECIMAL_ALWAYS_SHOWN:
        return df->isDecimalSeparatorAlwaysShown();    
        
    case UNUM_MAX_INTEGER_DIGITS:
        return df->getMaximumIntegerDigits();
        
    case UNUM_MIN_INTEGER_DIGITS:
        return df->getMinimumIntegerDigits();
        
    case UNUM_INTEGER_DIGITS:
        // TBD: what should this return?
        return df->getMinimumIntegerDigits();
        
    case UNUM_MAX_FRACTION_DIGITS:
        return df->getMaximumFractionDigits();
        
    case UNUM_MIN_FRACTION_DIGITS:
        return df->getMinimumFractionDigits();
        
    case UNUM_FRACTION_DIGITS:
        // TBD: what should this return?
        return df->getMinimumFractionDigits();
        
    case UNUM_SIGNIFICANT_DIGITS_USED:
        return df->areSignificantDigitsUsed();
        
    case UNUM_MAX_SIGNIFICANT_DIGITS:
        return df->getMaximumSignificantDigits();
        
    case UNUM_MIN_SIGNIFICANT_DIGITS:
        return df->getMinimumSignificantDigits();
        
    case UNUM_MULTIPLIER:
        return df->getMultiplier();    
        
    case UNUM_GROUPING_SIZE:
        return df->getGroupingSize();    
        
    case UNUM_ROUNDING_MODE:
        return df->getRoundingMode();
        
    case UNUM_FORMAT_WIDTH:
        return df->getFormatWidth();
        
    case UNUM_PADDING_POSITION:
        return df->getPadPosition();
        
    case UNUM_SECONDARY_GROUPING_SIZE:
        return df->getSecondaryGroupingSize();
        
    default:
        /* enums out of sync? unsupported enum? */
        break;
    }
  } else {
    U_ASSERT(((const NumberFormat*)fmt)->getDynamicClassID() == RuleBasedNumberFormat::getStaticClassID());
    if (attr == UNUM_LENIENT_PARSE) {
#if !UCONFIG_NO_COLLATION
      return ((const RuleBasedNumberFormat*)fmt)->isLenient();
#endif
    }
  }

  return -1;
}
UStringEnumeration::UStringEnumeration(UEnumeration* _uenum) :
    uenum(_uenum) {
    U_ASSERT(_uenum != 0);
}
Beispiel #14
0
const char *StandardPlural::getKeyword(Form p) {
    U_ASSERT(ZERO <= p && p < COUNT);
    return gKeywords[p];
}
Beispiel #15
0
//-----------------------------------------------------------------------------
//
//   buildStateTable()    Determine the set of runtime DFA states and the
//                        transition tables for these states, by the algorithm
//                        of fig. 3.44 in Aho.
//
//                        Most of the comments are quotes of Aho's psuedo-code.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::buildStateTable() {
    if (U_FAILURE(*fStatus)) {
        return;
    }
    RBBIStateDescriptor *failState;
    // Set it to NULL to avoid uninitialized warning
    RBBIStateDescriptor *initialState = NULL;
    //
    // Add a dummy state 0 - the stop state.  Not from Aho.
    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    if (failState == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
        goto ExitBuildSTdeleteall;
    }
    failState->fPositions = new UVector(*fStatus);
    if (failState->fPositions == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
        goto ExitBuildSTdeleteall;
    }
    fDStates->addElement(failState, *fStatus);
    if (U_FAILURE(*fStatus)) {
        goto ExitBuildSTdeleteall;
    }

    // initially, the only unmarked state in Dstates is firstpos(root),
    //       where toot is the root of the syntax tree for (r)#;
    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    if (initialState == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*fStatus)) {
        goto ExitBuildSTdeleteall;
    }
    initialState->fPositions = new UVector(*fStatus);
    if (initialState->fPositions == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*fStatus)) {
        goto ExitBuildSTdeleteall;
    }
    setAdd(initialState->fPositions, fTree->fFirstPosSet);
    fDStates->addElement(initialState, *fStatus);
    if (U_FAILURE(*fStatus)) {
        goto ExitBuildSTdeleteall;
    }

    // while there is an unmarked state T in Dstates do begin
    for (;;) {
        RBBIStateDescriptor *T = NULL;
        int32_t              tx;
        for (tx=1; tx<fDStates->size(); tx++) {
            RBBIStateDescriptor *temp;
            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
            if (temp->fMarked == FALSE) {
                T = temp;
                break;
            }
        }
        if (T == NULL) {
            break;
        }

        // mark T;
        T->fMarked = TRUE;

        // for each input symbol a do begin
        int32_t  a;
        for (a = 1; a<=lastInputSymbol; a++) {
            // let U be the set of positions that are in followpos(p)
            //    for some position p in T
            //    such that the symbol at position p is a;
            UVector    *U = NULL;
            RBBINode   *p;
            int32_t     px;
            for (px=0; px<T->fPositions->size(); px++) {
                p = (RBBINode *)T->fPositions->elementAt(px);
                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
                    if (U == NULL) {
                        U = new UVector(*fStatus);
                        if (U == NULL) {
				*fStatus = U_MEMORY_ALLOCATION_ERROR;
				goto ExitBuildSTdeleteall;
                        }
                    }
                    setAdd(U, p->fFollowPos);
                }
            }

            // if U is not empty and not in DStates then
            int32_t  ux = 0;
            UBool    UinDstates = FALSE;
            if (U != NULL) {
                U_ASSERT(U->size() > 0);
                int  ix;
                for (ix=0; ix<fDStates->size(); ix++) {
                    RBBIStateDescriptor *temp2;
                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
                    if (setEquals(U, temp2->fPositions)) {
                        delete U;
                        U  = temp2->fPositions;
                        ux = ix;
                        UinDstates = TRUE;
                        break;
                    }
                }

                // Add U as an unmarked state to Dstates
                if (!UinDstates)
                {
                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
                    if (newState == NULL) {
			*fStatus = U_MEMORY_ALLOCATION_ERROR;
                    }
                    if (U_FAILURE(*fStatus)) {
                        goto ExitBuildSTdeleteall;
                    }
                    newState->fPositions = U;
                    fDStates->addElement(newState, *fStatus);
                    if (U_FAILURE(*fStatus)) {
                        return;
                    }
                    ux = fDStates->size()-1;
                }

                // Dtran[T, a] := U;
                T->fDtran->setElementAt(ux, a);
            }
        }
    }
    return;
    // delete local pointers only if error occured.
ExitBuildSTdeleteall:
    delete initialState;
    delete failState;
}
Beispiel #16
0
//------------------------------------------------------------------------------
//
//  doParseAction        Do some action during rule parsing.
//                       Called by the parse state machine.
//                       Actions build the parse tree and Unicode Sets,
//                       and maintain the parse stack for nested expressions.
//
//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
//                              They represent exactly the same thing.  They're separate
//                              only to work around enum forward declaration restrictions
//                              in some compilers, while at the same time avoiding multiple
//                              definitions problems.  I'm sure that there's a better way.
//
//------------------------------------------------------------------------------
UBool RBBIRuleScanner::doParseActions(int32_t action)
{
    RBBINode *n       = NULL;

    UBool   returnVal = TRUE;

    switch (action) {

    case doExprStart:
        pushNewNode(RBBINode::opStart);
        fRuleNum++;
        break;


    case doNoChain:
        // Scanned a '^' while on the rule start state.
        fNoChainInRule = TRUE;
        break;


    case doExprOrOperator:
        {
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            orNode->fLeftChild     = operandNode;
            operandNode->fParent   = orNode;
        }
        break;

    case doExprCatOperator:
        // concatenation operator.
        // For the implicit concatenation of adjacent terms in an expression that are
        //   not separated by any other operator.  Action is invoked between the
        //   actions for the two terms.
        {
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            catNode->fLeftChild    = operandNode;
            operandNode->fParent   = catNode;
        }
        break;

    case doLParen:
        // Open Paren.
        //   The openParen node is a dummy operation type with a low precedence,
        //     which has the affect of ensuring that any real binary op that
        //     follows within the parens binds more tightly to the operands than
        //     stuff outside of the parens.
        pushNewNode(RBBINode::opLParen);
        break;

    case doExprRParen:
        fixOpStack(RBBINode::precLParen);
        break;

    case doNOP:
        break;

    case doStartAssign:
        // We've just scanned "$variable = "
        // The top of the node stack has the $variable ref node.

        // Save the start position of the RHS text in the StartExpression node
        //   that precedes the $variableReference node on the stack.
        //   This will eventually be used when saving the full $variable replacement
        //   text as a string.
        n = fNodeStack[fNodeStackPtr-1];
        n->fFirstPos = fNextIndex;              // move past the '='

        // Push a new start-of-expression node; needed to keep parse of the
        //   RHS expression happy.
        pushNewNode(RBBINode::opStart);
        break;




    case doEndAssign:
        {
            // We have reached the end of an assignement statement.
            //   Current scan char is the ';' that terminates the assignment.

            // Terminate expression, leaves expression parse tree rooted in TOS node.
            fixOpStack(RBBINode::precStart);

            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];

            // Save original text of right side of assignment, excluding the terminating ';'
            //  in the root of the node for the right-hand-side expression.
            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
            RHSExprNode->fLastPos  = fScanIndex;
            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);

            // Expression parse tree becomes l. child of the $variable reference node.
            varRefNode->fLeftChild = RHSExprNode;
            RHSExprNode->fParent   = varRefNode;

            // Make a symbol table entry for the $variableRef node.
            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
            if (U_FAILURE(*fRB->fStatus)) {
                // This is a round-about way to get the parse position set
                //  so that duplicate symbols error messages include a line number.
                UErrorCode t = *fRB->fStatus;
                *fRB->fStatus = U_ZERO_ERROR;
                error(t);
            }

            // Clean up the stack.
            delete startExprNode;
            fNodeStackPtr-=3;
            break;
        }

    case doEndOfRule:
        {
        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
            break;
        }
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
        U_ASSERT(fNodeStackPtr == 1);
        RBBINode *thisRule = fNodeStack[fNodeStackPtr];

        // If this rule includes a look-ahead '/', add a endMark node to the
        //   expression tree.
        if (fLookAheadRule) {
            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            fNodeStackPtr -= 2;
            catNode->fLeftChild       = thisRule;
            catNode->fRightChild      = endNode;
            fNodeStack[fNodeStackPtr] = catNode;
            endNode->fVal             = fRuleNum;
            endNode->fLookAheadEnd    = TRUE;
            thisRule                  = catNode;

            // TODO: Disable chaining out of look-ahead (hard break) rules.
            //   The break on rule match is forced, so there is no point in building up
            //   the state table to chain into another rule for a longer match.
        }

        // Mark this node as being the root of a rule.
        thisRule->fRuleRoot = TRUE;

        // Flag if chaining into this rule is wanted.
        //    
        if (fRB->fChainRules &&         // If rule chaining is enabled globally via !!chain
                !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
            thisRule->fChainIn = TRUE;
        }


        // All rule expressions are ORed together.
        // The ';' that terminates an expression really just functions as a '|' with
        //   a low operator prededence.
        //
        // Each of the four sets of rules are collected separately.
        //  (forward, reverse, safe_forward, safe_reverse)
        //  OR this rule into the appropriate group of them.
        //
        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);

        if (*destRules != NULL) {
            // This is not the first rule encounted.
            // OR previous stuff  (from *destRules)
            // with the current rule expression (on the Node Stack)
            //  with the resulting OR expression going to *destRules
            //
            RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];
            RBBINode  *prevRules   = *destRules;
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            orNode->fLeftChild     = prevRules;
            prevRules->fParent     = orNode;
            orNode->fRightChild    = thisRule;
            thisRule->fParent      = orNode;
            *destRules             = orNode;
        }
        else
        {
            // This is the first rule encountered (for this direction).
            // Just move its parse tree from the stack to *destRules.
            *destRules = fNodeStack[fNodeStackPtr];
        }
        fReverseRule   = FALSE;   // in preparation for the next rule.
        fLookAheadRule = FALSE;
        fNoChainInRule = FALSE;
        fNodeStackPtr  = 0;
        }
        break;


    case doRuleError:
        error(U_BRK_RULE_SYNTAX);
        returnVal = FALSE;
        break;


    case doVariableNameExpectedErr:
        error(U_BRK_RULE_SYNTAX);
        break;


    //
    //  Unary operands  + ? *
    //    These all appear after the operand to which they apply.
    //    When we hit one, the operand (may be a whole sub expression)
    //    will be on the top of the stack.
    //    Unary Operator becomes TOS, with the old TOS as its one child.
    case doUnaryOpPlus:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            plusNode->fLeftChild   = operandNode;
            operandNode->fParent   = plusNode;
        }
        break;

    case doUnaryOpQuestion:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            qNode->fLeftChild      = operandNode;
            operandNode->fParent   = qNode;
        }
        break;

    case doUnaryOpStar:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            starNode->fLeftChild   = operandNode;
            operandNode->fParent   = starNode;
        }
        break;

    case doRuleChar:
        // A "Rule Character" is any single character that is a literal part
        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
        // These are pretty uncommon in break rules; the terms are more commonly
        //  sets.  To keep things uniform, treat these characters like as
        // sets that just happen to contain only one character.
        {
            n = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(fC.fChar), n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doDotAny:
        // scanned a ".", meaning match any single character.
        {
            n = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(TRUE, kAny, 3), n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doSlash:
        // Scanned a '/', which identifies a look-ahead break position in a rule.
        n = pushNewNode(RBBINode::lookAhead);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fVal      = fRuleNum;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        fLookAheadRule = TRUE;
        break;


    case doStartTagValue:
        // Scanned a '{', the opening delimiter for a tag value within a rule.
        n = pushNewNode(RBBINode::tag);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fVal      = 0;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        break;

    case doTagDigit:
        // Just scanned a decimal digit that's part of a tag value
        {
            n = fNodeStack[fNodeStackPtr];
            uint32_t v = u_charDigitValue(fC.fChar);
            U_ASSERT(v < 10);
            n->fVal = n->fVal*10 + v;
            break;
        }

    case doTagValue:
        n = fNodeStack[fNodeStackPtr];
        n->fLastPos = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        break;

    case doTagExpectedError:
        error(U_BRK_MALFORMED_RULE_TAG);
        returnVal = FALSE;
        break;

    case doOptionStart:
        // Scanning a !!option.   At the start of string.
        fOptionStart = fScanIndex;
        break;

    case doOptionEnd:
        {
            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
            if (opt == UNICODE_STRING("chain", 5)) {
                fRB->fChainRules = TRUE;
            } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
                fRB->fLBCMNoChain = TRUE;
            } else if (opt == UNICODE_STRING("forward", 7)) {
                fRB->fDefaultTree   = &fRB->fForwardTree;
            } else if (opt == UNICODE_STRING("reverse", 7)) {
                fRB->fDefaultTree   = &fRB->fReverseTree;
            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeRevTree;
            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
                fRB->fLookAheadHardBreak = TRUE;
            } else {
                error(U_BRK_UNRECOGNIZED_OPTION);
            }
        }
        break;

    case doReverseDir:
        fReverseRule = TRUE;
        break;

    case doStartVariableName:
        n = pushNewNode(RBBINode::varRef);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fFirstPos = fScanIndex;
        break;

    case doEndVariableName:
        n = fNodeStack[fNodeStackPtr];
        if (n==NULL || n->fType != RBBINode::varRef) {
            error(U_BRK_INTERNAL_ERROR);
            break;
        }
        n->fLastPos = fScanIndex;
        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
        // Look the newly scanned name up in the symbol table
        //   If there's an entry, set the l. child of the var ref to the replacement expression.
        //   (We also pass through here when scanning assignments, but no harm is done, other
        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
        break;

    case doCheckVarDef:
        n = fNodeStack[fNodeStackPtr];
        if (n->fLeftChild == NULL) {
            error(U_BRK_UNDEFINED_VARIABLE);
            returnVal = FALSE;
        }
        break;

    case doExprFinished:
        break;

    case doRuleErrorAssignExpr:
        error(U_BRK_ASSIGN_ERROR);
        returnVal = FALSE;
        break;

    case doExit:
        returnVal = FALSE;
        break;

    case doScanUnicodeSet:
        scanSet();
        break;

    default:
        error(U_BRK_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
    }
    return returnVal && U_SUCCESS(*fRB->fStatus);
}
Beispiel #17
0
U_CAPI void U_EXPORT2
umtx_condBroadcast(UConditionVar *cond) {
    int sysErr = pthread_cond_broadcast(&cond->fCondition);
    (void)sysErr;
    U_ASSERT(sysErr == 0);
}
Beispiel #18
0
//------------------------------------------------------------------------------
//
//   findSetFor    given a UnicodeString,
//                  - find the corresponding Unicode Set  (uset node)
//                         (create one if necessary)
//                  - Set fLeftChild of the caller's node (should be a setRef node)
//                         to the uset node
//                 Maintain a hash table of uset nodes, so the same one is always used
//                    for the same string.
//                 If a "to adopt" set is provided and we haven't seen this key before,
//                    add the provided set to the hash table.
//                 If the string is one (32 bit) char in length, the set contains
//                    just one element which is the char in question.
//                 If the string is "any", return a set containing all chars.
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {

    RBBISetTableEl   *el;

    // First check whether we've already cached a set for this string.
    // If so, just use the cached set in the new node.
    //   delete any set provided by the caller, since we own it.
    el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
    if (el != NULL) {
        delete setToAdopt;
        node->fLeftChild = el->val;
        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
        return;
    }

    // Haven't seen this set before.
    // If the caller didn't provide us with a prebuilt set,
    //   create a new UnicodeSet now.
    if (setToAdopt == NULL) {
        if (s.compare(kAny, -1) == 0) {
            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
        } else {
            UChar32 c;
            c = s.char32At(0);
            setToAdopt = new UnicodeSet(c, c);
        }
    }

    //
    // Make a new uset node to refer to this UnicodeSet
    // This new uset node becomes the child of the caller's setReference node.
    //
    RBBINode *usetNode    = new RBBINode(RBBINode::uset);
    if (usetNode == NULL) {
        error(U_MEMORY_ALLOCATION_ERROR);
        return;
    }
    usetNode->fInputSet   = setToAdopt;
    usetNode->fParent     = node;
    node->fLeftChild      = usetNode;
    usetNode->fText = s;


    //
    // Add the new uset node to the list of all uset nodes.
    //
    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);


    //
    // Add the new set to the set hash table.
    //
    el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
    UnicodeString *tkey = new UnicodeString(s);
    if (tkey == NULL || el == NULL || setToAdopt == NULL) {
        // Delete to avoid memory leak
        delete tkey;
        tkey = NULL;
        uprv_free(el);
        el = NULL;
        delete setToAdopt;
        setToAdopt = NULL;

        error(U_MEMORY_ALLOCATION_ERROR);
        return;
    }
    el->key = tkey;
    el->val = usetNode;
    uhash_put(fSetTable, el->key, el, fRB->fStatus);

    return;
}
Beispiel #19
0
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            digit=basicToDigit[(uint8_t)src[in++]];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
Beispiel #20
0
//------------------------------------------------------------------------------
//
//  Parse RBBI rules.   The state machine for rules parsing is here.
//                      The state tables are hand-written in the file rbbirpt.txt,
//                      and converted to the form used here by a perl
//                      script rbbicst.pl
//
//------------------------------------------------------------------------------
void RBBIRuleScanner::parse() {
    uint16_t                state;
    const RBBIRuleTableEl  *tableEl;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    state = 1;
    nextChar(fC);
    //
    // Main loop for the rule parsing state machine.
    //   Runs once per state transition.
    //   Each time through optionally performs, depending on the state table,
    //      - an advance to the the next input char
    //      - an action to be performed.
    //      - pushing or popping a state to/from the local state return stack.
    //
    for (;;) {
        //  Bail out if anything has gone wrong.
        //  RBBI rule file parsing stops on the first error encountered.
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }

        // Quit if state == 0.  This is the normal way to exit the state machine.
        //
        if (state == 0) {
            break;
        }

        // Find the state table element that matches the input char from the rule, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
        #ifdef RBBI_DEBUG
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
            }
        #endif

        for (;;) {
            #ifdef RBBI_DEBUG
                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
            #endif
            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not escaped, and
                //   the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
                // Table row specified "escaped" and the char was escaped.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
                break;
            }
            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                fC.fEscaped == FALSE &&                                      //   char is not escaped &&
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    //   and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}

        //
        // We've found the row of the state table that matches the current input
        //   character from the rules string.
        // Perform any action specified  by this row in the state table.
        if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            //   the action signalled some kind of error, or
            //   the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        //   state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
                fStackPtr++;
            }
        }

    }

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }
    
    // If there are no forward rules set an error.
    //
    if (fRB->fForwardTree == NULL) {
        error(U_BRK_RULE_SYNTAX);
        return;
    }

    //
    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    //
    if (fRB->fReverseTree == NULL) {
        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
        RBBINode  *operand = pushNewNode(RBBINode::setRef);
        if (U_FAILURE(*fRB->fStatus)) {
            return;
        }
        findSetFor(UnicodeString(TRUE, kAny, 3), operand);
        fRB->fReverseTree->fLeftChild = operand;
        operand->fParent              = fRB->fReverseTree;
        fNodeStackPtr -= 2;
    }


    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
    // and a list of all UnicodeSets that are referenced.
    //
#ifdef RBBI_DEBUG
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
    {
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
        fRB->fForwardTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
        fRB->fReverseTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
        fRB->fSafeFwdTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
        fRB->fSafeRevTree->printTree(TRUE);
    }
#endif
}
Beispiel #21
0
void uRelease(uObject* object)
{
    if (object)
    {
        if (Xli::AtomicDecrement(&object->__retains) == 0)
        {
            if (!uTryClearWeak(object))
                return;
#ifdef DEBUG_ARC
            uThreadData* thread = uGetThreadData();

            if (thread->AutoReleasePtr >= thread->AutoReleaseStack)
            {
                uAutoReleaseFrame* frame = thread->AutoReleasePtr;
                if (frame->AllocCount > 0)
                {
                    frame->FreeCount++;
                    frame->FreeSize += object->__size;
                }
            }
#endif
            uType* type = object->__type;

            switch (type->Type)
            {
            case uTypeTypeClass:
            {
                uType* baseType = type;
                do
                {
                    if (baseType->fp_Finalize)
                    {
                        try { (*baseType->fp_Finalize)(object); }
                        catch (...) { Xli::Error->WriteFormat("Runtime Error: Unhandled exception in finalizer for %s\n", baseType->FullName); }
                    }
                } while ((baseType = baseType->Base));
                uReleaseStruct(type, object);
                break;
            }

            case uTypeTypeStruct:
                // This must be a boxed value, so append size of object header
                uReleaseStruct(type, (uint8_t*)object + sizeof(uObject));
                break;

            case uTypeTypeDelegate:
                uRelease(((uDelegate*)object)->_object);
                uRelease(((uDelegate*)object)->_prev);
                break;

            case uTypeTypeArray:
            {
                uArray* array = (uArray*)object;
                uArrayType* arrayType = (uArrayType*)type;
                uType* elmType = arrayType->ElementType;

                switch (elmType->Type)
                {
                case uTypeTypeClass:
                case uTypeTypeInterface:
                case uTypeTypeDelegate:
                case uTypeTypeArray:
                    for (uObject** objAddr = (uObject**)array->_ptr;
                         array->_length--;
                         objAddr++)
                        uRelease(*objAddr);
                    break;

                case uTypeTypeStruct:
                    for (uint8_t* address = (uint8_t*)array->_ptr;
                         array->_length--;
                         address += elmType->ValueSize)
                        uReleaseStruct(elmType, address);
                    break;

                default:
                    break;
                }
                break;
            }
            default:
                break;
            }
#if DEBUG_ARC >= 2
            Xli::Error->WriteFormat("free %s #%d (%d bytes)%s\n", object->__type->FullName, object->__id, object->__size, uGetCaller().Ptr());
#endif
#ifdef DEBUG_DUMPS
            uEnterCritical();
            _HeapObjects->Remove(object);
            uExitCritical();
#endif
            U_ASSERT(object->__type != ::g::Uno::Type_typeof());
            U_FREE_OBJECT(object);
            return;
        }

        if (object->__retains < 0)
        {
#if DEBUG_ARC >= 4
            Xli::Error->WriteFormat("*** BAD OBJECT: %s #%d (%d retains) ***%s\n", object->__type->FullName, object->__id, object->__retains, uGetCaller().Ptr());
#else
            Xli::Error->WriteFormat("*** BAD OBJECT: 0x%llx ***\n", (uintptr_t)object);
#endif
            U_FATAL();
        }
        else
        {
#if DEBUG_ARC >= 3
            Xli::Error->WriteFormat("release %s #%d (%d bytes, %d retains)%s\n", object->__type->FullName, object->__id, object->__size, object->__retains, uGetCaller().Ptr());
#endif
        }
    }
}
static void U_CALLCONV initNumberFormatService() {
    U_ASSERT(gService == NULL);
    ucln_i18n_registerCleanup(UCLN_I18N_NUMFMT, numfmt_cleanup);
    gService = new ICUNumberFormatService();
}
Beispiel #23
0
uObject* uNew(uType* type, size_t size)
{
    U_ASSERT(type && size);
    return uInitObject(type, U_MALLOC_OBJECT(size, type), size);
}
Beispiel #24
0
int
U_EXPORT main (int argc, char* argv[])
{
   U_ULIB_INIT(argv);

   U_TRACE(5,"main(%d)",argc)

   UTimeDate data1(31,12,99), data2("31/12/99");

   U_ASSERT( UTimeDate("14/09/1752").getJulian() == 2361222 )
   U_ASSERT( UTimeDate("31/12/1900").getJulian() == 2415385 )
   U_ASSERT( UTimeDate("01/01/1970").getJulian() == 2440588 )

   U_ASSERT( data1 == data2 )
   U_ASSERT( data1.getDayOfWeek() == 5 ) // Venerdi
   U_ASSERT( data2.getDayOfYear() == 365 )

   U_ASSERT( UTimeDate("1/3/00").getDayOfWeek() == 3 ) // Mercoledi
   U_ASSERT( UTimeDate(31,12,0).getDayOfYear() == 366 )

   UTimeDate data3(60,2000);
   UTimeDate data4("29/02/00");

   U_ASSERT( data3 == data4 )
   U_ASSERT( data3.getDayOfYear() == 60 )

   UTimeDate data5(60,1901);
   UTimeDate data6("1/3/1901");

   U_ASSERT( data5 == data6 )

   U_ASSERT( UTimeDate(17, 5, 2002).isValid() == true )  // TRUE   May 17th 2002 is valid
   U_ASSERT( UTimeDate(30, 2, 2002).isValid() == false ) // FALSE  Feb 30th does not exist
   U_ASSERT( UTimeDate(29, 2, 2004).isValid() == true )  // TRUE   2004 is a leap year

   UTimeDate data7(29, 2, 2004);

   UString x = data7.strftime("%Y-%m-%d");

   U_ASSERT( x == U_STRING_FROM_CONSTANT("2004-02-29") )

   U_ASSERT( UTimeDate("14/09/1752").getJulian() == 2361222 )

   cout << "Date: " << data6.strftime("%d/%m/%y") << '\n';

   while (cin >> data6) cout << data6 << '\n';

   U_ASSERT( UTimeDate::getSecondFromTime("19030314104248Z", true, "%4u%2u%2u%2u%2u%2uZ") < u_now->tv_sec )

   /*
   typedef struct static_date {
      struct timeval _timeval;      // => u_now
      char lock1[1];
      char date1[17+1];             // 18/06/12 18:45:56
      char lock2[1];
      char date2[26+1];             // 04/Jun/2012:18:18:37 +0200
      char lock3[1];
      char date3[6+29+2+12+2+19+1]; // Date: Wed, 20 Jun 2012 11:43:17 GMT\r\nServer: ULib\r\nConnection: close\r\n
   } static_date;
   */

   ULog::static_date log_data;

   (void) u_strftime2(log_data.date1, 17,               "%d/%m/%y %T",    u_now->tv_sec + u_now_adjust);
   (void) u_strftime2(log_data.date2, 26,               "%d/%b/%Y:%T %z", u_now->tv_sec + u_now_adjust);
   (void) u_strftime2(log_data.date3, 6+29+2+12+2+17+2, "Date: %a, %d %b %Y %T GMT\r\nServer: ULib\r\nConnection: close\r\n", u_now->tv_sec);

   U_INTERNAL_DUMP("date1 = %.17S date2 = %.26S date3+6 = %.29S", log_data.date1, log_data.date2, log_data.date3+6)

   /*
   for (int i = 0; i < 360; ++i)
      {
      u_now->tv_sec++;

      UTimeDate::updateTime(log_data.date1   + 12);
      UTimeDate::updateTime(log_data.date2   + 15);
      UTimeDate::updateTime(log_data.date3+6 + 20);

      cout.write(log_data.date1, 17);
      cout.write(" - ", 3);
      cout.write(log_data.date2, 26);
      cout.write(" - ", 3);
      cout.write(log_data.date3+6, 29);
      cout.put('\n');
      }
   */
}
Beispiel #25
0
/*
 * This method updates the cache and must be called with a lock
 */
const UChar*
TZGNCore::getGenericLocationName(const UnicodeString& tzCanonicalID) {
    U_ASSERT(!tzCanonicalID.isEmpty());
    if (tzCanonicalID.length() > ZID_KEY_MAX) {
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    UChar tzIDKey[ZID_KEY_MAX + 1];
    int32_t tzIDKeyLen = tzCanonicalID.extract(tzIDKey, ZID_KEY_MAX + 1, status);
    U_ASSERT(status == U_ZERO_ERROR);   // already checked length above
    tzIDKey[tzIDKeyLen] = 0;

    const UChar *locname = (const UChar *)uhash_get(fLocationNamesMap, tzIDKey);

    if (locname != NULL) {
        // gEmpty indicate the name is not available
        if (locname == gEmpty) {
            return NULL;
        }
        return locname;
    }

    // Construct location name
    UnicodeString name;
    UnicodeString usCountryCode;
    UBool isPrimary = FALSE;

    ZoneMeta::getCanonicalCountry(tzCanonicalID, usCountryCode, &isPrimary);

    if (!usCountryCode.isEmpty()) {
        if (isPrimary) {
            // If this is the primary zone in the country, use the country name.
            char countryCode[ULOC_COUNTRY_CAPACITY];
            U_ASSERT(usCountryCode.length() < ULOC_COUNTRY_CAPACITY);
            int32_t ccLen = usCountryCode.extract(0, usCountryCode.length(), countryCode, sizeof(countryCode), US_INV);
            countryCode[ccLen] = 0;

            UnicodeString country;
            fLocaleDisplayNames->regionDisplayName(countryCode, country);
            fRegionFormat.format(country, name, status);
        } else {
            // If this is not the primary zone in the country,
            // use the exemplar city name.

            // getExemplarLocationName should retur non-empty string
            // if the time zone is associated with a region

            UnicodeString city;
            fTimeZoneNames->getExemplarLocationName(tzCanonicalID, city);
            fRegionFormat.format(city, name, status);
        }
        if (U_FAILURE(status)) {
            return NULL;
        }
    }

    locname = name.isEmpty() ? NULL : fStringPool.get(name, status);
    if (U_SUCCESS(status)) {
        // Cache the result
        const UChar* cacheID = ZoneMeta::findTimeZoneID(tzCanonicalID);
        U_ASSERT(cacheID != NULL);
        if (locname == NULL) {
            // gEmpty to indicate - no location name available
            uhash_put(fLocationNamesMap, (void *)cacheID, (void *)gEmpty, &status);
        } else {
            uhash_put(fLocationNamesMap, (void *)cacheID, (void *)locname, &status);
            if (U_FAILURE(status)) {
                locname = NULL;
            } else {
                // put the name info into the trie
                GNameInfo *nameinfo = (ZNameInfo *)uprv_malloc(sizeof(GNameInfo));
                if (nameinfo != NULL) {
                    nameinfo->type = UTZGNM_LOCATION;
                    nameinfo->tzID = cacheID;
                    fGNamesTrie.put(locname, nameinfo, status);
                }
            }
        }
    }

    return locname;
}
Beispiel #26
0
//---------------------------------------------------------------------
//
//   dump    Output the compiled form of the pattern.
//           Debugging function only.
//
//---------------------------------------------------------------------
void   RegexPattern::dumpOp(int32_t index) const {
    (void)index;  // Suppress warnings in non-debug build.
#if defined(REGEX_DEBUG)
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);
    int32_t pinnedType  = type;
    if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
        pinnedType = 0;
    }

    printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
    case URX_BACKSLASH_H:
    case URX_BACKSLASH_R:
    case URX_BACKSLASH_V:
        // types with an integer operand field.
        printf("%d", val);
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        if (val < 0x20) {
            printf("%#x", val);
        } else {
            printf("'%s'", CStr(UnicodeString(val))());
        }
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            UnicodeString str(fLiteralText, val, length);
            printf("%s", CStr(str)());
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            printf("%s", CStr(s)());
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString s;
            if (val & URX_NEG_SET) {
                printf("NOT ");
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(s, TRUE);
            printf("%s", CStr(s)());
        }
        break;


    default:
        printf("??????");
        break;
    }
    printf("\n");
#endif
}
Beispiel #27
0
/*
 * This method updates the cache and must be called with a lock
 */
const UChar*
TZGNCore::getPartialLocationName(const UnicodeString& tzCanonicalID,
                        const UnicodeString& mzID, UBool isLong, const UnicodeString& mzDisplayName) {
    U_ASSERT(!tzCanonicalID.isEmpty());
    U_ASSERT(!mzID.isEmpty());
    U_ASSERT(!mzDisplayName.isEmpty());

    PartialLocationKey key;
    key.tzID = ZoneMeta::findTimeZoneID(tzCanonicalID);
    key.mzID = ZoneMeta::findMetaZoneID(mzID);
    key.isLong = isLong;
    U_ASSERT(key.tzID != NULL && key.mzID != NULL);

    const UChar* uplname = (const UChar*)uhash_get(fPartialLocationNamesMap, (void *)&key);
    if (uplname != NULL) {
        return uplname;
    }

    UnicodeString location;
    UnicodeString usCountryCode;
    ZoneMeta::getCanonicalCountry(tzCanonicalID, usCountryCode);
    if (!usCountryCode.isEmpty()) {
        char countryCode[ULOC_COUNTRY_CAPACITY];
        U_ASSERT(usCountryCode.length() < ULOC_COUNTRY_CAPACITY);
        int32_t ccLen = usCountryCode.extract(0, usCountryCode.length(), countryCode, sizeof(countryCode), US_INV);
        countryCode[ccLen] = 0;

        UnicodeString regionalGolden;
        fTimeZoneNames->getReferenceZoneID(mzID, countryCode, regionalGolden);
        if (tzCanonicalID == regionalGolden) {
            // Use country name
            fLocaleDisplayNames->regionDisplayName(countryCode, location);
        } else {
            // Otherwise, use exemplar city name
            fTimeZoneNames->getExemplarLocationName(tzCanonicalID, location);
        }
    } else {
        fTimeZoneNames->getExemplarLocationName(tzCanonicalID, location);
        if (location.isEmpty()) {
            // This could happen when the time zone is not associated with a country,
            // and its ID is not hierarchical, for example, CST6CDT.
            // We use the canonical ID itself as the location for this case.
            location.setTo(tzCanonicalID);
        }
    }

    UErrorCode status = U_ZERO_ERROR;
    UnicodeString name;
    fFallbackFormat.format(location, mzDisplayName, name, status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    uplname = fStringPool.get(name, status);
    if (U_SUCCESS(status)) {
        // Add the name to cache
        PartialLocationKey* cacheKey = (PartialLocationKey *)uprv_malloc(sizeof(PartialLocationKey));
        if (cacheKey != NULL) {
            cacheKey->tzID = key.tzID;
            cacheKey->mzID = key.mzID;
            cacheKey->isLong = key.isLong;
            uhash_put(fPartialLocationNamesMap, (void *)cacheKey, (void *)uplname, &status);
            if (U_FAILURE(status)) {
                uprv_free(cacheKey);
            } else {
                // put the name to the local trie as well
                GNameInfo *nameinfo = (ZNameInfo *)uprv_malloc(sizeof(GNameInfo));
                if (nameinfo != NULL) {
                    nameinfo->type = isLong ? UTZGNM_LONG : UTZGNM_SHORT;
                    nameinfo->tzID = key.tzID;
                    fGNamesTrie.put(uplname, nameinfo, status);
                }
            }
        }
    }
    return uplname;
}
Beispiel #28
0
//-----------------------------------------------------------------------------
//
//   calcChainedFollowPos.    Modify the previously calculated followPos sets
//                            to implement rule chaining.  NOT described by Aho
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {

    UVector         endMarkerNodes(*fStatus);
    UVector         leafNodes(*fStatus);
    int32_t         i;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    // get a list of all endmarker nodes.
    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);

    // get a list all leaf nodes
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Get all nodes that can be the start a match, which is FirstPosition()
    // of the portion of the tree corresponding to user-written rules.
    // See the tree description in bofFixup().
    RBBINode *userRuleRoot = tree;
    if (fRB->fSetBuilder->sawBOF()) {
        userRuleRoot = tree->fLeftChild->fRightChild;
    }
    U_ASSERT(userRuleRoot != NULL);
    UVector *matchStartNodes = userRuleRoot->fFirstPosSet;


    // Iteratate over all leaf nodes,
    //
    int32_t  endNodeIx;
    int32_t  startNodeIx;

    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
        RBBINode *endNode = NULL;

        // Identify leaf nodes that correspond to overall rule match positions.
        //   These include an endMarkerNode in their followPos sets.
        for (i=0; i<endMarkerNodes.size(); i++) {
            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
                endNode = tNode;
                break;
            }
        }
        if (endNode == NULL) {
            // node wasn't an end node.  Try again with the next.
            continue;
        }

        // We've got a node that can end a match.

        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
        //                            don't chain from it.
        // TODO:  Add rule syntax for this behavior, get specifics out of here and
        //        into the rule file.
        if (fRB->fLBCMNoChain) {
            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
            if (c != -1) {
                // c == -1 occurs with sets containing only the {eof} marker string.
                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
                if (cLBProp == U_LB_COMBINING_MARK) {
                    continue;
                }
            }
        }


        // Now iterate over the nodes that can start a match, looking for ones
        //   with the same char class as our ending node.
        RBBINode *startNode;
        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
            if (startNode->fType != RBBINode::leafChar) {
                continue;
            }

            if (endNode->fVal == startNode->fVal) {
                // The end val (character class) of one possible match is the
                //   same as the start of another.

                // Add all nodes from the followPos of the start node to the
                //  followPos set of the end node, which will have the effect of
                //  letting matches transition from a match state at endNode
                //  to the second char of a match starting with startNode.
                setAdd(endNode->fFollowPos, startNode->fFollowPos);
            }
        }
    }
}
Beispiel #29
0
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      U_ASSERT(status);
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      if(p>charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        free(info);
        fclose(f);
        ucnv_close(conv);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
Beispiel #30
0
/**
 * Currently, getDouble() depends on atof() to do its conversion.
 *
 * WARNING!!
 * This is an extremely costly function. ~1/2 of the conversion time
 * can be linked to this function.
 */
double
DigitList::getDouble() const
{
    // TODO:  fix thread safety.  Can probably be finessed some by analyzing
    //        what public const functions can see which DigitLists.
    //        Like precompute fDouble for DigitLists coming in from a parse
    //        or from a Formattable::set(), but not for any others.
    if (fHaveDouble) {
        return fDouble;
    }
    DigitList *nonConstThis = const_cast<DigitList *>(this);

    if (gDecimal == 0) {
        char rep[MAX_DIGITS];
        // For machines that decide to change the decimal on you,
        // and try to be too smart with localization.
        // This normally should be just a '.'.
        sprintf(rep, "%+1.1f", 1.0);
        gDecimal = rep[2];
    }

    if (isZero()) {
        nonConstThis->fDouble = 0.0;
        if (decNumberIsNegative(fDecNumber)) {
            nonConstThis->fDouble /= -1;
        }
    } else if (isInfinite()) {
        if (std::numeric_limits<double>::has_infinity) {
            nonConstThis->fDouble = std::numeric_limits<double>::infinity();
        } else {
            nonConstThis->fDouble = std::numeric_limits<double>::max();
        }
        if (!isPositive()) {
            nonConstThis->fDouble = -fDouble;
        } 
    } else {
        MaybeStackArray<char, MAX_DBL_DIGITS+18> s;
           // Note:  14 is a  magic constant from the decNumber library documentation,
           //        the max number of extra characters beyond the number of digits 
           //        needed to represent the number in string form.  Add a few more
           //        for the additional digits we retain.

        // Round down to appx. double precision, if the number is longer than that.
        // Copy the number first, so that we don't modify the original.
        if (getCount() > MAX_DBL_DIGITS + 3) {
            DigitList numToConvert(*this);
            numToConvert.reduce();    // Removes any trailing zeros, so that digit count is good.
            numToConvert.round(MAX_DBL_DIGITS+3);
            uprv_decNumberToString(numToConvert.fDecNumber, s);
            // TODO:  how many extra digits should be included for an accurate conversion?
        } else {
            uprv_decNumberToString(this->fDecNumber, s);
        }
        U_ASSERT(uprv_strlen(&s[0]) < MAX_DBL_DIGITS+18);
        
        if (gDecimal != '.') {
            char *decimalPt = strchr(s, '.');
            if (decimalPt != NULL) {
                *decimalPt = gDecimal;
            }
        }
        char *end = NULL;
        nonConstThis->fDouble = uprv_strtod(s, &end);
    }
    nonConstThis->fHaveDouble = TRUE;
    return fDouble;
}