Example #1
0
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    Transliterator* t = NULL;
    {
        Mutex m(NULL);
        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    }
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);

        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;

            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT, -1).append(target);
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            Transliterator *rt = NULL;
            {
                Mutex m(NULL);
                rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
                if (rt == NULL) {
                    // Common case, no race to cache this new transliterator.
                    uhash_iput(cache, (int32_t) source, t, &ec);
                } else {
                    // Race case, some other thread beat us to caching this transliterator.
                    Transliterator *temp = rt;
                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
                    t  = temp; // The transliterator from the cache that we will return.
                }
            }
            delete rt;    // will be non-null only in case of races.
        }
    }
    return t;
}
Example #2
0
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getName(source), -1, US_INV);
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);
        
        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;
            
            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT, -1).append(target);
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            uhash_iput(cache, (int32_t) source, t, &ec);
        }
    }

    return t;
}
Example #3
0
/**
 * Returns a pointer to a Region using the given numeric region code. If the numeric region code is not recognized,
 * the appropriate error code will be set ( U_ILLEGAL_ARGUMENT_ERROR ).
 */
const Region* U_EXPORT2 
Region::getInstance (int32_t code, UErrorCode &status) {

    loadRegionData();

    Region *r = (Region *)uhash_iget(numericCodeMap,code);

    if ( !r ) { // Just in case there's an alias that's numeric, try to find it.
        UErrorCode fs = U_ZERO_ERROR;
        UnicodeString pat = UNICODE_STRING_SIMPLE("00#");
        DecimalFormat *df = new DecimalFormat(pat,fs);
        
        UnicodeString id;
        id.remove();
        df->format(code,id);
        delete df;
        r = (Region *)uhash_get(regionAliases,&id);
    }

    if ( !r ) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if ( r->type == URGN_DEPRECATED && r->preferredValues->size() == 1) {
        StringEnumeration *pv = r->getPreferredValues();
        pv->reset(status);
        const UnicodeString *ustr = pv->snext(status);
        r = (Region *)uhash_get(regionIDMap,(void *)ustr);
        delete pv;
    }

    return r;
}
Example #4
0
/**
 * Returns a pointer to a Region using the given numeric region code. If the numeric region code is not recognized,
 * the appropriate error code will be set ( U_ILLEGAL_ARGUMENT_ERROR ).
 */
const Region* U_EXPORT2
Region::getInstance (int32_t code, UErrorCode &status) {

    umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    Region *r = (Region *)uhash_iget(numericCodeMap,code);

    if ( !r ) { // Just in case there's an alias that's numeric, try to find it.
        UnicodeString pat = UNICODE_STRING_SIMPLE("0");
        LocalPointer<DecimalFormat> df(new DecimalFormat(pat,status), status);
        if( U_FAILURE(status) ) {
            return NULL;
        }
        UnicodeString id;
        id.remove();
        FieldPosition posIter;
        df->format(code,id, posIter, status);
        r = (Region *)uhash_get(regionAliases,&id);
    }

    if( U_FAILURE(status) ) {
        return NULL;
    }

    if ( !r ) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if ( r->type == URGN_DEPRECATED && r->preferredValues->size() == 1) {
        StringEnumeration *pv = r->getPreferredValues(status);
        pv->reset(status);
        const UnicodeString *ustr = pv->snext(status);
        r = (Region *)uhash_get(regionIDMap,(void *)ustr);
        delete pv;
    }

    return r;
}
Example #5
0
StringList *CEToStringsMap::getStringList(uint32_t ce) const
{
    return (StringList *) uhash_iget(map, ce);
}
Example #6
0
void ConfusabledataBuilder::addKeyEntry(
    UChar32     keyChar,     // The key character
    UHashtable *table,       // The table, one of SATable, MATable, etc.
    int32_t     tableFlag,   // One of USPOOF_SA_TABLE_FLAG, etc.
    UErrorCode &status) {

    SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
    if (targetMapping == NULL) {
        // No mapping for this key character.
        //   (This function is called for all four tables for each key char that
        //    is seen anywhere, so this no entry cases are very much expected.)
        return;
    }

    // Check whether there is already an entry with the correct mapping.
    // If so, simply set the flag in the keyTable saying that the existing entry
    // applies to the table that we're doing now.

    UBool keyHasMultipleValues = FALSE;
    int32_t i;
    for (i=fKeyVec->size()-1; i>=0 ; i--) {
        int32_t key = fKeyVec->elementAti(i);
        if ((key & 0x0ffffff) != keyChar) {
            // We have now checked all existing key entries for this key char (if any)
            //  without finding one with the same mapping.
            break;
        }
        UnicodeString mapping = getMapping(i);
        if (mapping == *(targetMapping->fStr)) {
            // The run time entry we are currently testing has the correct mapping.
            // Set the flag in it indicating that it applies to the new table also.
            key |= tableFlag;
            fKeyVec->setElementAt(key, i);
            return;
        }
        keyHasMultipleValues = TRUE;
    }

    // Need to add a new entry to the binary data being built for this mapping.
    // Includes adding entries to both the key table and the parallel values table.

    int32_t newKey = keyChar | tableFlag;
    if (keyHasMultipleValues) {
        newKey |= USPOOF_KEY_MULTIPLE_VALUES;
    }
    int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
    if (adjustedMappingLength>3) {
        adjustedMappingLength = 3;
    }
    newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;

    int32_t newData = targetMapping->fStrTableIndex;

    fKeyVec->addElement(newKey, status);
    fValueVec->addElement(newData, status);

    // If the preceding key entry is for the same key character (but with a different mapping)
    //   set the multiple-values flag on it.
    if (keyHasMultipleValues) {
        int32_t previousKeyIndex = fKeyVec->size() - 2;
        int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
        previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
        fKeyVec->setElementAt(previousKey, previousKeyIndex);
    }
}
Example #7
0
NumberFormat*
NumberFormat::makeInstance(const Locale& desiredLocale,
                           EStyles style,
                           UErrorCode& status)
{
    if (U_FAILURE(status)) return NULL;

    if (style < 0 || style >= kStyleCount) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

#ifdef U_WINDOWS
    char buffer[8];
    int32_t count = desiredLocale.getKeywordValue("compat", buffer, sizeof(buffer), status);

    // if the locale has "@compat=host", create a host-specific NumberFormat
    if (count > 0 && uprv_strcmp(buffer, "host") == 0) {
        Win32NumberFormat *f = NULL;
        UBool curr = TRUE;

        switch (style) {
        case kNumberStyle:
            curr = FALSE;
            // fall-through

        case kCurrencyStyle:
        case kIsoCurrencyStyle: // do not support plural formatting here
        case kPluralCurrencyStyle:
            f = new Win32NumberFormat(desiredLocale, curr, status);

            if (U_SUCCESS(status)) {
                return f;
            }

            delete f;
            break;

        default:
            break;
        }
    }
#endif

    NumberFormat* f = NULL;
    DecimalFormatSymbols* symbolsToAdopt = NULL;
    UnicodeString pattern;
    UResourceBundle *resource = ures_open((char *)0, desiredLocale.getName(), &status);
    UResourceBundle *numberPatterns = ures_getByKey(resource, DecimalFormat::fgNumberPatterns, NULL, &status);
    NumberingSystem *ns = NULL;
    UBool deleteSymbols = TRUE;
    UHashtable * cache = NULL;
    int32_t hashKey;
    UBool getCache = FALSE;
    UBool deleteNS = FALSE;

    if (U_FAILURE(status)) {
        // We don't appear to have resource data available -- use the last-resort data
        status = U_USING_FALLBACK_WARNING;
        // When the data is unavailable, and locale isn't passed in, last resort data is used.
        symbolsToAdopt = new DecimalFormatSymbols(status);

        // Creates a DecimalFormat instance with the last resort number patterns.
        pattern.setTo(TRUE, gLastResortNumberPatterns[style], -1);
    }
    else {
        // If not all the styled patterns exists for the NumberFormat in this locale,
        // sets the status code to failure and returns nil.
        if (ures_getSize(numberPatterns) < (int32_t)(sizeof(gLastResortNumberPatterns)/sizeof(gLastResortNumberPatterns[0])) -2 ) { //minus 2: ISO and plural
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // Loads the decimal symbols of the desired locale.
        symbolsToAdopt = new DecimalFormatSymbols(desiredLocale, status);

        int32_t patLen = 0;

        /* for ISOCURRENCYSTYLE and PLURALCURRENCYSTYLE,
         * the pattern is the same as the pattern of CURRENCYSTYLE
         * but by replacing the single currency sign with
         * double currency sign or triple currency sign.
         */
        int styleInNumberPattern = ((style == kIsoCurrencyStyle ||
                                     style == kPluralCurrencyStyle) ?
                                    kCurrencyStyle : style);

        const UChar *patResStr = ures_getStringByIndex(numberPatterns, (int32_t)styleInNumberPattern, &patLen, &status);

        // Creates the specified decimal format style of the desired locale.
        pattern.setTo(TRUE, patResStr, patLen);
    }
    if (U_FAILURE(status) || symbolsToAdopt == NULL) {
        goto cleanup;
    }
    if(style==kCurrencyStyle || style == kIsoCurrencyStyle){
        const UChar* currPattern = symbolsToAdopt->getCurrencyPattern();
        if(currPattern!=NULL){
            pattern.setTo(currPattern, u_strlen(currPattern));
        }
    }

    // Use numbering system cache hashtable
    UMTX_CHECK(&nscacheMutex, (UBool)(cache != NumberingSystem_cache), getCache);
    if (getCache) {
        umtx_lock(&nscacheMutex);
        cache = NumberingSystem_cache;
        umtx_unlock(&nscacheMutex);
    }

    // Check cache we got, create if non-existant
    status = U_ZERO_ERROR;
    if (cache == NULL) {
        cache = uhash_open(uhash_hashLong,
                           uhash_compareLong,
                           NULL,
                           &status);

        if (cache == NULL || U_FAILURE(status)) {
            // cache not created - out of memory
            cache = NULL;
        }
        else {
            // cache created
            uhash_setValueDeleter(cache, deleteNumberingSystem);

            // set final NumberingSystem_cache value
            UHashtable* h = NULL;

            UMTX_CHECK(&nscacheMutex, (UBool)(h != NumberingSystem_cache), getCache);
            if (getCache) {
                umtx_lock(&nscacheMutex);
                h = NumberingSystem_cache;
                umtx_unlock(&nscacheMutex);
            }
            if (h == NULL) {
                umtx_lock(&nscacheMutex);
                NumberingSystem_cache = h = cache;
                cache = NULL;
                ucln_i18n_registerCleanup(UCLN_I18N_NUMFMT, numfmt_cleanup);
                umtx_unlock(&nscacheMutex);
            }

            if(cache != NULL) {
              uhash_close(cache);
            }
            cache = h;
        }
    }

    // Get cached numbering system
    if (cache != NULL) {
        hashKey = desiredLocale.hashCode();

        umtx_lock(&nscacheMutex);
        ns = (NumberingSystem *)uhash_iget(cache, hashKey);
        if (ns == NULL) {
            ns = NumberingSystem::createInstance(desiredLocale,status);
            uhash_iput(cache, hashKey, (void*)ns, &status);
        }
        umtx_unlock(&nscacheMutex);
    }
    else {
        ns = NumberingSystem::createInstance(desiredLocale,status);
        deleteNS = TRUE;
    }

    // check results of getting a numbering system
    if ((ns == NULL) || (U_FAILURE(status))) {
        goto cleanup;
    }

    if (ns->isAlgorithmic()) {
        UnicodeString nsDesc;
        UnicodeString nsRuleSetGroup;
        UnicodeString nsRuleSetName;
        Locale nsLoc;
        URBNFRuleSetTag desiredRulesType = URBNF_NUMBERING_SYSTEM;

        nsDesc.setTo(ns->getDescription());
        int32_t firstSlash = nsDesc.indexOf(gSlash);
        int32_t lastSlash = nsDesc.lastIndexOf(gSlash);
        if ( lastSlash > firstSlash ) {
            char nsLocID[ULOC_FULLNAME_CAPACITY];

            nsDesc.extract(0,firstSlash,nsLocID,ULOC_FULLNAME_CAPACITY,US_INV);
            nsRuleSetGroup.setTo(nsDesc,firstSlash+1,lastSlash-firstSlash-1);
            nsRuleSetName.setTo(nsDesc,lastSlash+1);

            nsLoc = Locale::createFromName(nsLocID);

            UnicodeString SpelloutRules = UNICODE_STRING_SIMPLE("SpelloutRules");
            if ( nsRuleSetGroup.compare(SpelloutRules) == 0 ) {
                desiredRulesType = URBNF_SPELLOUT;
            }
        } else {
            nsLoc = desiredLocale;
            nsRuleSetName.setTo(nsDesc);
        }

        RuleBasedNumberFormat *r = new RuleBasedNumberFormat(desiredRulesType,nsLoc,status);

        if (U_FAILURE(status) || r == NULL) {
            goto cleanup;
        }
        r->setDefaultRuleSet(nsRuleSetName,status);
        f = (NumberFormat *) r;

    } else {
        // replace single currency sign in the pattern with double currency sign
        // if the style is kIsoCurrencyStyle
        if (style == kIsoCurrencyStyle) {
            pattern.findAndReplace(gSingleCurrencySign, gDoubleCurrencySign);
        }

        f = new DecimalFormat(pattern, symbolsToAdopt, style, status);
        if (U_FAILURE(status) || f == NULL) {
            goto cleanup;
        }
        deleteSymbols = FALSE;
    }

    f->setLocaleIDs(ures_getLocaleByType(numberPatterns, ULOC_VALID_LOCALE, &status),
                    ures_getLocaleByType(numberPatterns, ULOC_ACTUAL_LOCALE, &status));

cleanup:
    ures_close(numberPatterns);
    ures_close(resource);

    if (deleteNS && ns) {
        delete ns;
    }

    if (U_FAILURE(status)) {
        /* If f exists, then it will delete the symbols */
        if (f==NULL) {
            delete symbolsToAdopt;
        }
        else {
            delete f;
        }
        return NULL;
    }
    if (f == NULL || symbolsToAdopt == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        f = NULL;
    }
    if (deleteSymbols && symbolsToAdopt != NULL) {
        delete symbolsToAdopt;
    }
    return f;
}
NumberFormat*
NumberFormat::makeInstance(const Locale& desiredLocale,
                           UNumberFormatStyle style,
                           UBool mustBeDecimalFormat,
                           UErrorCode& status) {
    if (U_FAILURE(status)) return NULL;

    if (style < 0 || style >= UNUM_FORMAT_STYLE_COUNT) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // Some styles are not supported. This is a result of merging
    // the @draft ICU 4.2 NumberFormat::EStyles into the long-existing UNumberFormatStyle.
    // Ticket #8503 is for reviewing/fixing/merging the two relevant implementations:
    // this one and unum_open().
    // The UNUM_PATTERN_ styles are not supported here
    // because this method does not take a pattern string.
    if (!isStyleSupported(style)) {
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }

#if U_PLATFORM_USES_ONLY_WIN32_API
    if (!mustBeDecimalFormat) {
        char buffer[8];
        int32_t count = desiredLocale.getKeywordValue("compat", buffer, sizeof(buffer), status);

        // if the locale has "@compat=host", create a host-specific NumberFormat
        if (U_SUCCESS(status) && count > 0 && uprv_strcmp(buffer, "host") == 0) {
            Win32NumberFormat *f = NULL;
            UBool curr = TRUE;

            switch (style) {
            case UNUM_DECIMAL:
                curr = FALSE;
                // fall-through

            case UNUM_CURRENCY:
            case UNUM_CURRENCY_ISO: // do not support plural formatting here
            case UNUM_CURRENCY_PLURAL:
                f = new Win32NumberFormat(desiredLocale, curr, status);

                if (U_SUCCESS(status)) {
                    return f;
                }

                delete f;
                break;
            default:
                break;
            }
        }
    }
#endif
    // Use numbering system cache hashtable
    umtx_initOnce(gNSCacheInitOnce, &nscacheInit);

    // Get cached numbering system
    LocalPointer<NumberingSystem> ownedNs;
    NumberingSystem *ns = NULL;
    if (NumberingSystem_cache != NULL) {
        // TODO: Bad hash key usage, see ticket #8504.
        int32_t hashKey = desiredLocale.hashCode();

        Mutex lock(&nscacheMutex);
        ns = (NumberingSystem *)uhash_iget(NumberingSystem_cache, hashKey);
        if (ns == NULL) {
            ns = NumberingSystem::createInstance(desiredLocale,status);
            uhash_iput(NumberingSystem_cache, hashKey, (void*)ns, &status);
        }
    } else {
        ownedNs.adoptInstead(NumberingSystem::createInstance(desiredLocale,status));
        ns = ownedNs.getAlias();
    }

    // check results of getting a numbering system
    if (U_FAILURE(status)) {
        return NULL;
    }

    if (mustBeDecimalFormat && ns->isAlgorithmic()) {
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }

    LocalPointer<DecimalFormatSymbols> symbolsToAdopt;
    UnicodeString pattern;
    LocalUResourceBundlePointer ownedResource(ures_open(NULL, desiredLocale.getName(), &status));
    if (U_FAILURE(status)) {
        // We don't appear to have resource data available -- use the last-resort data
        status = U_USING_FALLBACK_WARNING;
        // When the data is unavailable, and locale isn't passed in, last resort data is used.
        symbolsToAdopt.adoptInstead(new DecimalFormatSymbols(status));
        if (symbolsToAdopt.isNull()) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }

        // Creates a DecimalFormat instance with the last resort number patterns.
        pattern.setTo(TRUE, gLastResortNumberPatterns[style], -1);
    }
    else {
        // Loads the decimal symbols of the desired locale.
        symbolsToAdopt.adoptInstead(new DecimalFormatSymbols(desiredLocale, status));
        if (symbolsToAdopt.isNull()) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }

        UResourceBundle *resource = ownedResource.orphan();
        UResourceBundle *numElements = ures_getByKeyWithFallback(resource, gNumberElements, NULL, &status);
        resource = ures_getByKeyWithFallback(numElements, ns->getName(), resource, &status);
        resource = ures_getByKeyWithFallback(resource, gPatterns, resource, &status);
        ownedResource.adoptInstead(resource);

        int32_t patLen = 0;
        const UChar *patResStr = ures_getStringByKeyWithFallback(resource, gFormatKeys[style], &patLen, &status);

        // Didn't find a pattern specific to the numbering system, so fall back to "latn"
        if ( status == U_MISSING_RESOURCE_ERROR && uprv_strcmp(gLatn,ns->getName())) {
            status = U_ZERO_ERROR;
            resource = ures_getByKeyWithFallback(numElements, gLatn, resource, &status);
            resource = ures_getByKeyWithFallback(resource, gPatterns, resource, &status);
            patResStr = ures_getStringByKeyWithFallback(resource, gFormatKeys[style], &patLen, &status);
        }

        ures_close(numElements);

        // Creates the specified decimal format style of the desired locale.
        pattern.setTo(TRUE, patResStr, patLen);
    }
    if (U_FAILURE(status)) {
        return NULL;
    }
    if(style==UNUM_CURRENCY || style == UNUM_CURRENCY_ISO){
        const UChar* currPattern = symbolsToAdopt->getCurrencyPattern();
        if(currPattern!=NULL){
            pattern.setTo(currPattern, u_strlen(currPattern));
        }
    }


    NumberFormat *f;
    if (ns->isAlgorithmic()) {
        UnicodeString nsDesc;
        UnicodeString nsRuleSetGroup;
        UnicodeString nsRuleSetName;
        Locale nsLoc;
        URBNFRuleSetTag desiredRulesType = URBNF_NUMBERING_SYSTEM;

        nsDesc.setTo(ns->getDescription());
        int32_t firstSlash = nsDesc.indexOf(gSlash);
        int32_t lastSlash = nsDesc.lastIndexOf(gSlash);
        if ( lastSlash > firstSlash ) {
            CharString nsLocID;

            nsLocID.appendInvariantChars(nsDesc.tempSubString(0, firstSlash), status);
            nsRuleSetGroup.setTo(nsDesc,firstSlash+1,lastSlash-firstSlash-1);
            nsRuleSetName.setTo(nsDesc,lastSlash+1);

            nsLoc = Locale::createFromName(nsLocID.data());

            UnicodeString SpelloutRules = UNICODE_STRING_SIMPLE("SpelloutRules");
            if ( nsRuleSetGroup.compare(SpelloutRules) == 0 ) {
                desiredRulesType = URBNF_SPELLOUT;
            }
        } else {
            nsLoc = desiredLocale;
            nsRuleSetName.setTo(nsDesc);
        }

        RuleBasedNumberFormat *r = new RuleBasedNumberFormat(desiredRulesType,nsLoc,status);
        if (r == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        r->setDefaultRuleSet(nsRuleSetName,status);
        f = r;
    } else {
        // replace single currency sign in the pattern with double currency sign
        // if the style is UNUM_CURRENCY_ISO
        if (style == UNUM_CURRENCY_ISO) {
            pattern.findAndReplace(UnicodeString(TRUE, gSingleCurrencySign, 1),
                                   UnicodeString(TRUE, gDoubleCurrencySign, 2));
        }

        // "new DecimalFormat()" does not adopt the symbols if its memory allocation fails.
        DecimalFormatSymbols *syms = symbolsToAdopt.orphan();
        f = new DecimalFormat(pattern, syms, style, status);
        if (f == NULL) {
            delete syms;
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
    }

    f->setLocaleIDs(ures_getLocaleByType(ownedResource.getAlias(), ULOC_VALID_LOCALE, &status),
                    ures_getLocaleByType(ownedResource.getAlias(), ULOC_ACTUAL_LOCALE, &status));
    if (U_FAILURE(status)) {
        delete f;
        return NULL;
    }
    return f;
}
Example #9
0
void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
               UErrorCode &status) {

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    int32_t inputLen = 0;
    if (U_FAILURE(status)) {
        return;
    }
    u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }
    status = U_ZERO_ERROR;
    fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (fInput == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);


    // Regular Expression to parse a line from Confusables.txt.  The expression will match
    // any line.  What was matched is determined by examining which capture groups have a match.
    //   Capture Group 1:  the source char
    //   Capture Group 2:  the replacement chars
    //   Capture Group 3-6  the table type, SL, SA, ML, or MA (deprecated)
    //   Capture Group 7:  A blank or comment only line.
    //   Capture Group 8:  A syntactically invalid line.  Anything that didn't match before.
    // Example Line from the confusables.txt source file:
    //   "1D702 ;	006E 0329 ;	SL	# MATHEMATICAL ITALIC SMALL ETA ... "
    UnicodeString pattern(
        "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;"      // Match the source char
        "[ \\t]*([0-9A-Fa-f]+"                    // Match the replacement char(s)
           "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;"    //     (continued)
        "\\s*(?:(SL)|(SA)|(ML)|(MA))"             // Match the table type
        "[ \\t]*(?:#.*?)?$"                       // Match any trailing #comment
        "|^([ \\t]*(?:#.*?)?)$"       // OR match empty lines or lines with only a #comment
        "|^(.*?)$", -1, US_INV);      // OR match any line, which catches illegal lines.
    // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
    fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

    // Regular expression for parsing a hex number out of a space-separated list of them.
    //   Capture group 1 gets the number, with spaces removed.
    pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
    fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*fInput == 0xfeff) {
        *fInput = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(fParseLine, fInput, inputLen, &status);
    while (uregex_findNext(fParseLine, &status)) {
        fLineNum++;
        if (uregex_start(fParseLine, 7, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(fParseLine, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            return;
        }

        // We have a good input line.  Extract the key character and mapping string, and
        //    put them into the appropriate mapping table.
        UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
                          uregex_end(fParseLine, 1, &status), status);

        int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
        int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
        uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);

        UnicodeString  *mapString = new UnicodeString();
        if (mapString == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        while (uregex_findNext(fParseHexNum, &status)) {
            UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
                                 uregex_end(fParseHexNum, 1, &status), status);
            mapString->append(c);
        }
        U_ASSERT(mapString->length() >= 1);

        // Put the map (value) string into the string pool
        // This a little like a Java intern() - any duplicates will be eliminated.
        SPUString *smapString = stringPool->addString(mapString, status);

        // Add the UChar32 -> string mapping to the table.
        // For Unicode 8, the SL, SA and ML tables have been discontinued.
        //                All input data from confusables.txt is tagged MA.
        uhash_iput(fTable, keyChar, smapString, &status);
        if (U_FAILURE(status)) { return; }
        fKeySet->add(keyChar);
    }

    // Input data is now all parsed and collected.
    // Now create the run-time binary form of the data.
    //
    // This is done in two steps.  First the data is assembled into vectors and strings,
    //   for ease of construction, then the contents of these collections are dumped
    //   into the actual raw-bytes data storage.

    // Build up the string array, and record the index of each string therein
    //  in the (build time only) string pool.
    // Strings of length one are not entered into the strings array.
    // (Strings in the table are sorted by length)
    stringPool->sort(status);
    fStringTable = new UnicodeString();
    int32_t poolSize = stringPool->size();
    int32_t i;
    for (i=0; i<poolSize; i++) {
        SPUString *s = stringPool->getByIndex(i);
        int32_t strLen = s->fStr->length();
        int32_t strIndex = fStringTable->length();
        if (strLen == 1) {
            // strings of length one do not get an entry in the string table.
            // Keep the single string character itself here, which is the same
            //  convention that is used in the final run-time string table index.
            s->fCharOrStrTableIndex = s->fStr->charAt(0);
        } else {
            s->fCharOrStrTableIndex = strIndex;
            fStringTable->append(*(s->fStr));
        }
    }

    // Construct the compile-time Key and Value tables
    //
    // For each key code point, check which mapping tables it applies to,
    //   and create the final data for the key & value structures.
    //
    //   The four logical mapping tables are conflated into one combined table.
    //   If multiple logical tables have the same mapping for some key, they
    //     share a single entry in the combined table.
    //   If more than one mapping exists for the same key code point, multiple
    //     entries will be created in the table

    for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
        // It is an oddity of the UnicodeSet API that simply enumerating the contained
        //   code points requires a nested loop.
        for (UChar32 keyChar=fKeySet->getRangeStart(range);
                keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
            SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
            U_ASSERT(targetMapping != NULL);

            // Set an error code if trying to consume a long string.  Otherwise,
            // codePointAndLengthToKey will abort on a U_ASSERT.
            if (targetMapping->fStr->length() > 256) {
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }

            int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
                targetMapping->fStr->length());
            int32_t value = targetMapping->fCharOrStrTableIndex;

            fKeyVec->addElement(key, status);
            fValueVec->addElement(value, status);
        }
    }

    // Put the assembled data into the flat runtime array
    outputData(status);

    // All of the intermediate allocated data belongs to the ConfusabledataBuilder
    //  object  (this), and is deleted in the destructor.
    return;
}