예제 #1
0
//----------------------------------------------------------------------------------------
//
//   findSetFor    given a UnicodeString,
//                  - find the corresponding Unicode Set  (uset node)
//                         (create one if necessary)
//                  - Set fLeftChild of the caller's node (should be a setRef node)
//                         to the uset node
//                 Maintain a hash table of uset nodes, so the same one is always used
//                    for the same string.
//                 If a "to adopt" set is provided and we haven't seen this key before,
//                    add the provided set to the hash table.
//                 If the string is one (32 bit) char in length, the set contains
//                    just one element which is the char in question.
//                 If the string is "any", return a set containing all chars.
//
//----------------------------------------------------------------------------------------
void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {

    RBBISetTableEl   *el;

    // First check whether we've already cached a set for this string.
    // If so, just use the cached set in the new node.
    //   delete any set provided by the caller, since we own it.
    el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
    if (el != NULL) {
        delete setToAdopt;
        node->fLeftChild = el->val;
        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
        return;
    }

    // Haven't seen this set before.
    // If the caller didn't provide us with a prebuilt set,
    //   create a new UnicodeSet now.
    if (setToAdopt == NULL) {
        if (s.compare(kAny, -1) == 0) {
            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
        } else {
            UChar32 c;
            c = s.char32At(0);
            setToAdopt = new UnicodeSet(c, c);
        }
    }

    //
    // Make a new uset node to refer to this UnicodeSet
    // This new uset node becomes the child of the caller's setReference node.
    //
    RBBINode *usetNode    = new RBBINode(RBBINode::uset);
    usetNode->fInputSet   = setToAdopt;
    usetNode->fParent     = node;
    node->fLeftChild      = usetNode;
    usetNode->fText = s;


    //
    // Add the new uset node to the list of all uset nodes.
    //
    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);


    //
    // Add the new set to the set hash table.
    //
    el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
    UnicodeString *tkey = new UnicodeString(s);
    if (tkey == NULL || el == NULL || setToAdopt == NULL) {
        error(U_MEMORY_ALLOCATION_ERROR);
        return;
    }
    el->key = tkey;
    el->val = usetNode;
    uhash_put(fSetTable, el->key, el, fRB->fStatus);

    return;
}
예제 #2
0
void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {

    if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
        dest = name;
        return;
    }

    UErrorCode status = U_ZERO_ERROR;
    initPinyinBounds(col, status);
    if (U_FAILURE(status)) {
        dest = name;
        return;
    }
    // TODO:  use binary search
    int index;
    for (index=0; ; index++) {
        if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
            index--;
            break;
        }
        int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
        if (compareResult < 0) {
            index--;
        }
        if (compareResult <= 0) {
            break;
        }
    }
    UChar c = PINYIN_LOWER_BOUNDS[index];
    dest.setTo(c);
    dest.append(name);
    return;
}
예제 #3
0
void AccumulativeWordCounter::operator+=(const UnicodeString& ustr)
{
	for(int32_t i=0; i<ustr.length(); i=ustr.moveIndex32(i,1))
	{
		this->operator+=(ustr.char32At(i));
	}
}
예제 #4
0
파일: uspoof.cpp 프로젝트: basti1302/node
U_I18N_API UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t /*type*/,
                                const UnicodeString &id,
                                UnicodeString &dest,
                                UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return dest;
    }

    UnicodeString nfdId;
    gNfdNormalizer->normalize(id, nfdId, *status);

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    int32_t normalizedLen = nfdId.length();
    for (inputIndex=0; inputIndex < normalizedLen; ) {
        UChar32 c = nfdId.char32At(inputIndex);
        inputIndex += U16_LENGTH(c);
        This->fSpoofData->confusableLookup(c, skelStr);
    }

    gNfdNormalizer->normalize(skelStr, dest, *status);
    return dest;
}
예제 #5
0
UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    // assert(pos < str.length());
    // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
    UnicodeString buf;
    int p = pos;
    while (p < str.length()) {
        UChar32 ch = str.char32At(p);
        if (buf.length() == 0) {
            if (u_isIDStart(ch)) {
                buf.append(ch);
            } else {
                buf.truncate(0);
                return buf;
            }
        } else {
            if (u_isIDPart(ch)) {
                buf.append(ch);
            } else {
                break;
            }
        }
        p += UTF_CHAR_LENGTH(ch);
    }
    pos = p;
    return buf;
}
예제 #6
0
int32_t ICU_Utility::parseNumber(const UnicodeString& text,
                                 int32_t& pos, int8_t radix) {
    // assert(pos[0] >= 0);
    // assert(radix >= 2);
    // assert(radix <= 36);
    int32_t n = 0;
    int32_t p = pos;
    while (p < text.length()) {
        UChar32 ch = text.char32At(p);
        int32_t d = u_digit(ch, radix);
        if (d < 0) {
            break;
        }
        n = radix*n + d;
        // ASSUME that when a 32-bit integer overflows it becomes
        // negative.  E.g., 214748364 * 10 + 8 => negative value.
        if (n < 0) {
            return -1;
        }
        ++p;
    }
    if (p == pos) {
        return -1;
    }
    pos = p;
    return n;
}
예제 #7
0
//---------------------------------------------------------------------------------------
//
//  wholeScriptCheck()
//
//      Input text is already normalized to NFD
//      Return the set of scripts, each of which can represent something that is
//             confusable with the input text.  The script of the input text
//             is included; input consisting of characters from a single script will
//             always produce a result consisting of a set containing that script.
//
//---------------------------------------------------------------------------------------
void SpoofImpl::wholeScriptCheck(
        const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {

    UTrie2 *table =
        (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
    result->setAll();
    int32_t length = text.length();
    for (int32_t inputIdx=0; inputIdx < length;) {
        UChar32 c = text.char32At(inputIdx);
        inputIdx += U16_LENGTH(c);
        uint32_t index = utrie2_get32(table, c);
        if (index == 0) {
            // No confusables in another script for this char.
            // TODO:  we should change the data to have sets with just the single script
            //        bit for the script of this char.  Gets rid of this special case.
            //        Until then, grab the script from the char and intersect it with the set.
            UScriptCode cpScript = uscript_getScript(c, &status);
            U_ASSERT(cpScript > USCRIPT_INHERITED);
            result->intersect(cpScript, status);
        } else if (index == 1) {
            // Script == Common or Inherited.  Nothing to do.
        } else {
            result->intersect(fSpoofData->fScriptSets[index]);
        }
    }
}
예제 #8
0
void
TextTrieMap::search(CharacterNode *node, const UnicodeString &text, int32_t start,
                  int32_t index, TextTrieMapSearchResultHandler *handler, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return;
    }
    if (node->hasValues()) {
        if (!handler->handleMatch(index - start, node, status)) {
            return;
        }
        if (U_FAILURE(status)) {
            return;
        }
    }
    UChar32 c = text.char32At(index);
    if (fIgnoreCase) {
        // size of character may grow after fold operation
        UnicodeString tmp(c);
        tmp.foldCase();
        int32_t tmpidx = 0;
        while (tmpidx < tmp.length()) {
            c = tmp.char32At(tmpidx);
            node = getChildNode(node, c);
            if (node == NULL) {
                break;
            }
            tmpidx = tmp.moveIndex32(tmpidx, 1);
        }
    } else {
        node = getChildNode(node, c);
    }
    if (node != NULL) {
        search(node, text, start, index+1, handler, status);
    }
}
예제 #9
0
void StaticUnicodeSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName,
                              const UnicodeSet &set, const UnicodeString &str) {
    if (str.countChar32(0, str.length()) != 1) {
        // Ignore locale strings with more than one code point (usually a bidi mark)
        return;
    }
    assertInSet(localeName, setName, set, str.char32At(0));
}
예제 #10
0
 void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
     UChar32 c = 0;
     int32_t len = word.length();
     for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
         c = word.char32At(i);
         buf.append(transform(c, errorCode), errorCode);
     }
 }
예제 #11
0
파일: common.cpp 프로젝트: kluge-iitk/pyicu
int32_t toUChar32(UnicodeString& u, UChar32 *c, UErrorCode& status)
{
#if U_ICU_VERSION_HEX >= 0x04020000
    return u.toUTF32(c, 1, status);
#else
    int32_t len = u.length();
    if (len >= 1)
        *c = u.char32At(0);
    return len;
#endif
}
예제 #12
0
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }
    UnicodeString skelStr;
    fSpoofData->confusableLookup(cp, skelStr);
    UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
    if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
        return true;
    }
    return false;
}
예제 #13
0
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    result.clear();

    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Store a representative character for each kind of decimal digit
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
            // Store the zero character as a representative for comparison.
            // Unicode guarantees it is codePoint - value
            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
        }
    }
}
void AlphabeticIndexTest::HackPinyinTest() {
    UErrorCode status = U_ZERO_ERROR;
    AlphabeticIndex aindex(Locale::createFromName("zh"), status);
    TEST_CHECK_STATUS; 

    UnicodeString names[sizeof(pinyinTestData) / sizeof(pinyinTestData[0])];
    int32_t  nameCount;
    for (nameCount=0; pinyinTestData[nameCount] != NULL; nameCount++) {
        names[nameCount] = UnicodeString(pinyinTestData[nameCount], -1, UnicodeString::kInvariant).unescape();
        aindex.addRecord(names[nameCount], &names[nameCount], status);
        TEST_CHECK_STATUS; 
        if (U_FAILURE(status)) {
            return;
        }
    }
    TEST_ASSERT(nameCount == aindex.getRecordCount(status));
    
    // Weak checking:  make sure that none of the Chinese names landed in the overflow bucket
    //   of the index, and that the names are distributed among several buckets.
    //   (Exact expected data would be subject to change with evolution of the collation rules.)

    int32_t bucketCount = 0;
    int32_t filledBucketCount = 0;
    while (aindex.nextBucket(status)) {
        bucketCount++;
        UnicodeString label = aindex.getBucketLabel();
        // std::string s;
        // std::cout << label.toUTF8String(s) << ":  ";

        UBool  bucketHasContents = FALSE;
        while (aindex.nextRecord(status)) {
            bucketHasContents = TRUE;
            UnicodeString name = aindex.getRecordName();
            if (aindex.getBucketLabelType() != U_ALPHAINDEX_NORMAL) {
                errln("File %s, Line %d, Name \"\\u%x\" is in an under or overflow bucket.",
                    __FILE__, __LINE__, name.char32At(0));
            }
            // s.clear();
            // std::cout << aindex.getRecordName().toUTF8String(s) << " ";
        }
        if (bucketHasContents) {
            filledBucketCount++;
        }
        // std::cout << std::endl;
    }
    TEST_ASSERT(bucketCount > 25);
    TEST_ASSERT(filledBucketCount > 15);
}
예제 #15
0
void UnicodeTest::TestScriptMetadata() {
    IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
    UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
    // So far, sample characters are uppercase.
    // Georgian is special.
    UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
    for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
        UScriptCode sc = (UScriptCode)sci;
        // Run the test with -v to see which script has failures:
        // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL
        logln(uscript_getShortName(sc));
        UScriptUsage usage = uscript_getUsage(sc);
        UnicodeString sample = uscript_getSampleUnicodeString(sc);
        UnicodeSet scriptSet;
        scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
        if(usage == USCRIPT_USAGE_NOT_ENCODED) {
            assertTrue("not encoded, no sample", sample.isEmpty());
            assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
            assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
            assertFalse("not encoded, not cased", uscript_isCased(sc));
            assertTrue("not encoded, no characters", scriptSet.isEmpty());
        } else {
            assertFalse("encoded, has a sample character", sample.isEmpty());
            UChar32 firstChar = sample.char32At(0);
            UScriptCode charScript = getCharScript(sc);
            assertEquals("script(sample(script))",
                         charScript, uscript_getScript(firstChar, errorCode));
            assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc));
            assertEquals("cased vs. set", cased.contains(firstChar), uscript_isCased(sc));
            assertEquals("encoded, has characters", sc == charScript, !scriptSet.isEmpty());
            if(uscript_isRightToLeft(sc)) {
                rtl.removeAll(scriptSet);
            }
            if(uscript_isCased(sc)) {
                cased.removeAll(scriptSet);
            }
        }
    }
    UnicodeString pattern;
    assertEquals("no remaining RTL characters",
                 UnicodeString("[]"), rtl.toPattern(pattern));
    assertEquals("no remaining cased characters",
                 UnicodeString("[]"), cased.toPattern(pattern));

    assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
    assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
    assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
}
예제 #16
0
//------------------------------------------------------------------------------
//
//  stripRules    Return a rules string without extra spaces.
//                (Comments are removed separately, during rule parsing.)
//
//------------------------------------------------------------------------------
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
    UnicodeString strippedRules;
    int32_t rulesLength = rules.length();
    bool skippingSpaces = false;

    for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
        UChar32 cp = rules.char32At(idx);
        bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
        if (skippingSpaces && whiteSpace) {
            continue;
        }
        strippedRules.append(cp);
        skippingSpaces = whiteSpace;
    }
    return strippedRules;
}
예제 #17
0
U_I18N_API UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
                                const UnicodeString &id,
                                UnicodeString &dest,
                                UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return dest;
    }

   int32_t tableMask = 0;
   switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return dest;
    }

    UnicodeString nfdId;
    gNfdNormalizer->normalize(id, nfdId, *status);

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    int32_t normalizedLen = nfdId.length();
    for (inputIndex=0; inputIndex < normalizedLen; ) {
        UChar32 c = nfdId.char32At(inputIndex);
        inputIndex += U16_LENGTH(c);
        This->confusableLookup(c, tableMask, skelStr);
    }

    gNfdNormalizer->normalize(skelStr, dest, *status);
    return dest;
}
예제 #18
0
파일: msgfmt.cpp 프로젝트: mason105/red5cpp
/**
 * Convert a string to an unsigned decimal, ignoring rule whitespace.
 * @return a non-negative number if successful, or a negative number
 *         upon failure.
 */
static int32_t stou(const UnicodeString& string) {
    int32_t n = 0;
    int32_t count = 0;
    UChar32 c;
    for (int32_t i=0; i<string.length(); i+=U16_LENGTH(c)) {
        c = string.char32At(i);
        if (uprv_isRuleWhiteSpace(c)) {
            continue;
        }
        int32_t d = u_digit(c, 10);
        if (d < 0 || ++count > 10) {
            return -1;
        }
        n = 10*n + d;
    }
    return n;
}
예제 #19
0
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
    UnicodeString result;
    if (item.length() == 0) {
        return result;
    }
    int32_t i = 0;
    for (;;) {
        UChar32  cp = item.char32At(i);
        result.append(cp);
        i = item.moveIndex32(i, 1);
        if (i >= item.length()) {
            break;
        }
        result.append(CGJ);
    }
    return result;
}
예제 #20
0
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool sawLeadCharacter = false;
    for (int32_t i=0; i<input.length();) {
        UChar32 cp = input.char32At(i);
        if (sawLeadCharacter && cp == 0x0307) {
            return i;
        }
        uint8_t combiningClass = u_getCombiningClass(cp);
        // Skip over characters except for those with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307)
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (combiningClass == 0 || combiningClass == 230) {
            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
        }
        i += U16_LENGTH(cp);
    }
    return -1;
}
예제 #21
0
// Computes the resolved script set for a string, omitting characters having the specified script.
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    result.setAll();

    ScriptSet temp;
    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Compute the augmented script set for the character
        getAugmentedScriptSet(codePoint, temp, status);
        if (U_FAILURE(status)) { return; }

        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
        // have the script specified in the function call
        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
            result.intersect(temp);
        }
    }
}
예제 #22
0
void CountWords(const UnicodeString& ustr, size_t& cnt, size_t& ctrl_cnt, size_t& sp_cnt)
{
	UErrorCode status = U_ZERO_ERROR;
	boost::scoped_ptr<BreakIterator> bi (
			BreakIterator::createWordInstance(Locale::getDefault(), status)
			);
	bi->setText(ustr);
	int32_t i = bi->first();
	while (i < ustr.length())
	{
		++cnt;

		UChar32 ch = ustr.char32At(i);

		if (u_iscntrl(ch))
			++ctrl_cnt;
		else if(u_isspace(ch))
			++sp_cnt;

		i = bi->next();
	}
}
예제 #23
0
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    // The base class implementation of toRules munges the ID into
    // the correct format.  That is: foo => ::foo
    if (escapeUnprintable) {
        rulesSource.truncate(0);
        UnicodeString id = getID();
        for (int32_t i=0; i<id.length();) {
            UChar32 c = id.char32At(i);
            if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
                rulesSource.append(c);
            }
            i += UTF_CHAR_LENGTH(c);
        }
    } else {
        rulesSource = getID();
    }
    // KEEP in sync with rbt_pars
    rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
    rulesSource.append(ID_DELIM);
    return rulesSource;
}
예제 #24
0
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UnicodeString nfd;
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    if(nfcNorm2->getDecomposition(c, nfd)) {
        /* c has a decomposition */
        if(nfd.length()==1) {
            c=nfd[0];  /* single BMP code point */
        } else if(nfd.length()<=U16_MAX_LENGTH &&
                  nfd.length()==U16_LENGTH(c=nfd.char32At(0))
        ) {
            /* single supplementary code point */
        } else {
            c=U_SENTINEL;
        }
    } else if(c<0) {
        return FALSE;  /* protect against bad input */
    }
    if(c>=0) {
        /* single code point */
        const UCaseProps *csp=ucase_getSingleton();
        const UChar *resultString;
        return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
    } else {
        /* guess some large but stack-friendly capacity */
        UChar dest[2*UCASE_MAX_STRING_LENGTH];
        int32_t destLength;
        destLength=u_strFoldCase(dest, LENGTHOF(dest),
                                  nfd.getBuffer(), nfd.length(),
                                  U_FOLD_CASE_DEFAULT, &errorCode);
        return (UBool)(U_SUCCESS(errorCode) &&
                       0!=u_strCompare(nfd.getBuffer(), nfd.length(),
                                       dest, destLength, FALSE));
    }
}
예제 #25
0
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const icu::UnicodeString &id, 
                          int32_t *position,
                          UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    int32_t result = 0;

    IdentifierInfo *identifierInfo = NULL;
    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
        identifierInfo = This->getIdentifierInfo(*status);
        if (U_FAILURE(*status)) {
            goto cleanupAndReturn;
        }
        identifierInfo->setIdentifier(id, *status);
        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
    }


    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
        if (idRestrictionLevel > This->fRestrictionLevel) {
            result |= USPOOF_RESTRICTION_LEVEL;
        }
        if (This->fChecks & USPOOF_AUX_INFO) {
            result |= idRestrictionLevel;
        }
    }

    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
        const UnicodeSet *numerics = identifierInfo->getNumerics();
        if (numerics->size() > 1) {
            result |= USPOOF_MIXED_NUMBERS;
        }

        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
        //       We have no easy way to do the same in C.
        // if (checkResult != null) {
        //     checkResult.numerics = numerics;
        // }
    }


    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
        int32_t i;
        UChar32 c;
        int32_t length = id.length();
        for (i=0; i<length ;) {
            c = id.char32At(i);
            i += U16_LENGTH(c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                break;
            }
        }
    }

    if (This->fChecks & 
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        UnicodeString nfdText;
        gNfdNormalizer->normalize(id, nfdText, *status);
        int32_t nfdLength = nfdText.length();

        if (This->fChecks & USPOOF_INVISIBLE) {
           
            // scan for more than one occurence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;  
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
            
            for (i=0; i<nfdLength ;) {
                c = nfdText.char32At(i);
                i += U16_LENGTH(c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }
       
        
        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            // confusable with itself in its own script.
            //
            // If the number of such scripts is two or more, and the input consisted of
            // characters all from a single script, we have a whole script confusable.
            // (The two scripts will be the original script and the one that is confusable)
            //
            // If the number of such scripts >= one, and the original input contained characters from
            // more than one script, we have a mixed script confusable.  (We can transform
            // some of the characters, and end up with a visually similar string all in
            // one script.)

            if (identifierInfo == NULL) {
                identifierInfo = This->getIdentifierInfo(*status);
                if (U_FAILURE(*status)) {
                    goto cleanupAndReturn;
                }
                identifierInfo->setIdentifier(id, *status);
            }

            int32_t scriptCount = identifierInfo->getScriptCount();
            
            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);
            
            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
        
            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

cleanupAndReturn:
    This->releaseIdentifierInfo(identifierInfo);
    if (position != NULL) {
        *position = 0;
    }
    return result;
}
예제 #26
0
UnicodeString& RelativeDateFormat::format(  Calendar& cal,
                                UnicodeString& appendTo,
                                FieldPosition& pos) const {

    UErrorCode status = U_ZERO_ERROR;
    UnicodeString relativeDayString;
    UDisplayContext capitalizationContext = getContext(UDISPCTX_TYPE_CAPITALIZATION, status);

    // calculate the difference, in days, between 'cal' and now.
    int dayDiff = dayDifference(cal, status);

    // look up string
    int32_t len = 0;
    const UChar *theString = getStringForDay(dayDiff, len, status);
    if(U_SUCCESS(status) && (theString!=NULL)) {
        // found a relative string
        relativeDayString.setTo(theString, len);
    }

    if ( relativeDayString.length() > 0 && !fDatePattern.isEmpty() &&
         (fTimePattern.isEmpty() || fCombinedFormat == NULL || fCombinedHasDateAtStart)) {
#if !UCONFIG_NO_BREAK_ITERATION
        // capitalize relativeDayString according to context for relative, set formatter no context
        if ( u_islower(relativeDayString.char32At(0)) && fCapitalizationBrkIter!= NULL &&
             ( capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && fCapitalizationOfRelativeUnitsForUIListMenu) ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && fCapitalizationOfRelativeUnitsForStandAlone) ) ) {
            // titlecase first word of relativeDayString
            relativeDayString.toTitle(fCapitalizationBrkIter, fLocale, U_TITLECASE_NO_LOWERCASE | U_TITLECASE_NO_BREAK_ADJUSTMENT);
        }
#endif
        fDateTimeFormatter->setContext(UDISPCTX_CAPITALIZATION_NONE, status);
    } else {
        // set our context for the formatter
        fDateTimeFormatter->setContext(capitalizationContext, status);
    }

    if (fDatePattern.isEmpty()) {
        fDateTimeFormatter->applyPattern(fTimePattern);
        fDateTimeFormatter->format(cal,appendTo,pos);
    } else if (fTimePattern.isEmpty() || fCombinedFormat == NULL) {
        if (relativeDayString.length() > 0) {
            appendTo.append(relativeDayString);
        } else {
            fDateTimeFormatter->applyPattern(fDatePattern);
            fDateTimeFormatter->format(cal,appendTo,pos);
        }
    } else {
        UnicodeString datePattern;
        if (relativeDayString.length() > 0) {
            // Need to quote the relativeDayString to make it a legal date pattern
            relativeDayString.findAndReplace(UNICODE_STRING("'", 1), UNICODE_STRING("''", 2)); // double any existing APOSTROPHE
            relativeDayString.insert(0, APOSTROPHE); // add APOSTROPHE at beginning...
            relativeDayString.append(APOSTROPHE); // and at end
            datePattern.setTo(relativeDayString);
        } else {
            datePattern.setTo(fDatePattern);
        }
        UnicodeString combinedPattern;
        fCombinedFormat->format(fTimePattern, datePattern, combinedPattern, status);
        fDateTimeFormatter->applyPattern(combinedPattern);
        fDateTimeFormatter->format(cal,appendTo,pos);
    }

    return appendTo;
}
예제 #27
0
//  testConfData - Check each data item from the Unicode confusables.txt file,
//                 verify that it transforms correctly in a skeleton.
//
void IntlTestSpoof::testConfData() {
    UErrorCode status = U_ZERO_ERROR;

    const char *testDataDir = IntlTest::getSourceTestData(status);
    TEST_ASSERT_SUCCESS(status);
    char buffer[2000];
    uprv_strcpy(buffer, testDataDir);
    uprv_strcat(buffer, "confusables.txt");

    LocalStdioFilePointer f(fopen(buffer, "rb"));
    if (f.isNull()) {
        errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
        return;
    }
    fseek(f.getAlias(), 0, SEEK_END);
    int32_t  fileSize = ftell(f.getAlias());
    LocalArray<char> fileBuf(new char[fileSize]);
    fseek(f.getAlias(), 0, SEEK_SET);
    int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
    TEST_ASSERT_EQ(amt_read, fileSize);
    TEST_ASSERT(fileSize>0);
    if (amt_read != fileSize || fileSize <=0) {
        return;
    }
    UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize));

    LocalUSpoofCheckerPointer sc(uspoof_open(&status));
    TEST_ASSERT_SUCCESS(status);

    // Parse lines from the confusables.txt file.  Example Line:
    // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
    // Three fields.  The hex fields can contain more than one character,
    //                and each character may be more than 4 digits (for supplemntals)
    // This regular expression matches lines and splits the fields into capture groups.
    RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
    TEST_ASSERT_SUCCESS(status);
    while (parseLine.find()) {
        UnicodeString from = parseHex(parseLine.group(1, status));
        if (!Normalizer::isNormalized(from, UNORM_NFD, status)) {
            // The source character was not NFD.
            // Skip this case; the first step in obtaining a skeleton is to NFD the input,
            //  so the mapping in this line of confusables.txt will never be applied.
            continue;
        }

        UnicodeString rawExpected = parseHex(parseLine.group(2, status));
        UnicodeString expected;
        Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status);
        TEST_ASSERT_SUCCESS(status);

        int32_t skeletonType = 0;
        UnicodeString tableType = parseLine.group(3, status);
        TEST_ASSERT_SUCCESS(status);
        if (tableType.indexOf("SL") >= 0) {
            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
        } else if (tableType.indexOf("SA") >= 0) {
            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
        } else if (tableType.indexOf("ML") >= 0) {
            skeletonType = 0;
        } else if (tableType.indexOf("MA") >= 0) {
            skeletonType = USPOOF_ANY_CASE;
        }

        UnicodeString actual;
        uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(actual == expected);
        if (actual != expected) {
            errln(parseLine.group(0, status));
            UnicodeString line = "Actual: ";
            int i = 0;
            while (i < actual.length()) {
                appendHexUChar(line, actual.char32At(i));
                i = actual.moveIndex32(i, 1);
            }
            errln(line);
        }
        if (U_FAILURE(status)) {
            break;
        }
    }
}
예제 #28
0
 virtual UChar32 getChar32At(int32_t offset) const{
     return chars.char32At(offset);
 }
void
DecimalFormatPatternParser::applyPatternWithoutExpandAffix(
        const UnicodeString& pattern,
        DecimalFormatPattern& out,
        UParseError& parseError,
        UErrorCode& status) {
    if (U_FAILURE(status))
    {
        return;
    }
    out = DecimalFormatPattern();

    // Clear error struct
    parseError.offset = -1;
    parseError.preContext[0] = parseError.postContext[0] = (UChar)0;

    // TODO: Travis Keep: This won't always work.
    UChar nineDigit = (UChar)(fZeroDigit + 9);
    int32_t digitLen = fDigit.length();
    int32_t groupSepLen = fGroupingSeparator.length();
    int32_t decimalSepLen = fDecimalSeparator.length();

    int32_t pos = 0;
    int32_t patLen = pattern.length();
    // Part 0 is the positive pattern.  Part 1, if present, is the negative
    // pattern.
    for (int32_t part=0; part<2 && pos<patLen; ++part) {
        // The subpart ranges from 0 to 4: 0=pattern proper, 1=prefix,
        // 2=suffix, 3=prefix in quote, 4=suffix in quote.  Subpart 0 is
        // between the prefix and suffix, and consists of pattern
        // characters.  In the prefix and suffix, percent, perMill, and
        // currency symbols are recognized and translated.
        int32_t subpart = 1, sub0Start = 0, sub0Limit = 0, sub2Limit = 0;

        // It's important that we don't change any fields of this object
        // prematurely.  We set the following variables for the multiplier,
        // grouping, etc., and then only change the actual object fields if
        // everything parses correctly.  This also lets us register
        // the data from part 0 and ignore the part 1, except for the
        // prefix and suffix.
        UnicodeString prefix;
        UnicodeString suffix;
        int32_t decimalPos = -1;
        int32_t multiplier = 1;
        int32_t digitLeftCount = 0, zeroDigitCount = 0, digitRightCount = 0, sigDigitCount = 0;
        int8_t groupingCount = -1;
        int8_t groupingCount2 = -1;
        int32_t padPos = -1;
        UChar32 padChar = 0;
        int32_t roundingPos = -1;
        DigitList roundingInc;
        int8_t expDigits = -1;
        UBool expSignAlways = FALSE;

        // The affix is either the prefix or the suffix.
        UnicodeString* affix = &prefix;

        int32_t start = pos;
        UBool isPartDone = FALSE;
        UChar32 ch;

        for (; !isPartDone && pos < patLen; ) {
            // Todo: account for surrogate pairs
            ch = pattern.char32At(pos);
            switch (subpart) {
            case 0: // Pattern proper subpart (between prefix & suffix)
                // Process the digits, decimal, and grouping characters.  We
                // record five pieces of information.  We expect the digits
                // to occur in the pattern ####00.00####, and we record the
                // number of left digits, zero (central) digits, and right
                // digits.  The position of the last grouping character is
                // recorded (should be somewhere within the first two blocks
                // of characters), as is the position of the decimal point,
                // if any (should be in the zero digits).  If there is no
                // decimal point, then there should be no right digits.
                if (pattern.compare(pos, digitLen, fDigit) == 0) {
                    if (zeroDigitCount > 0 || sigDigitCount > 0) {
                        ++digitRightCount;
                    } else {
                        ++digitLeftCount;
                    }
                    if (groupingCount >= 0 && decimalPos < 0) {
                        ++groupingCount;
                    }
                    pos += digitLen;
                } else if ((ch >= fZeroDigit && ch <= nineDigit) ||
                           ch == fSigDigit) {
                    if (digitRightCount > 0) {
                        // Unexpected '0'
                        debug("Unexpected '0'")
                        status = U_UNEXPECTED_TOKEN;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    if (ch == fSigDigit) {
                        ++sigDigitCount;
                    } else {
                        if (ch != fZeroDigit && roundingPos < 0) {
                            roundingPos = digitLeftCount + zeroDigitCount;
                        }
                        if (roundingPos >= 0) {
                            roundingInc.append((char)(ch - fZeroDigit + '0'));
                        }
                        ++zeroDigitCount;
                    }
                    if (groupingCount >= 0 && decimalPos < 0) {
                        ++groupingCount;
                    }
                    pos += U16_LENGTH(ch);
                } else if (pattern.compare(pos, groupSepLen, fGroupingSeparator) == 0) {
                    if (decimalPos >= 0) {
                        // Grouping separator after decimal
                        debug("Grouping separator after decimal")
                        status = U_UNEXPECTED_TOKEN;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    groupingCount2 = groupingCount;
                    groupingCount = 0;
                    pos += groupSepLen;
                } else if (pattern.compare(pos, decimalSepLen, fDecimalSeparator) == 0) {
                    if (decimalPos >= 0) {
                        // Multiple decimal separators
                        debug("Multiple decimal separators")
                        status = U_MULTIPLE_DECIMAL_SEPARATORS;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    // Intentionally incorporate the digitRightCount,
                    // even though it is illegal for this to be > 0
                    // at this point.  We check pattern syntax below.
                    decimalPos = digitLeftCount + zeroDigitCount + digitRightCount;
                    pos += decimalSepLen;
                } else {
                    if (pattern.compare(pos, fExponent.length(), fExponent) == 0) {
                        if (expDigits >= 0) {
                            // Multiple exponential symbols
                            debug("Multiple exponential symbols")
                            status = U_MULTIPLE_EXPONENTIAL_SYMBOLS;
                            syntaxError(pattern,pos,parseError);
                            return;
                        }
                        if (groupingCount >= 0) {
                            // Grouping separator in exponential pattern
                            debug("Grouping separator in exponential pattern")
                            status = U_MALFORMED_EXPONENTIAL_PATTERN;
                            syntaxError(pattern,pos,parseError);
                            return;
                        }
                        pos += fExponent.length();
                        // Check for positive prefix
                        if (pos < patLen
                            && pattern.compare(pos, fPlus.length(), fPlus) == 0) {
                            expSignAlways = TRUE;
                            pos += fPlus.length();
                        }
                        // Use lookahead to parse out the exponential part of the
                        // pattern, then jump into suffix subpart.
                        expDigits = 0;
                        while (pos < patLen &&
                               pattern.char32At(pos) == fZeroDigit) {
                            ++expDigits;
                            pos += U16_LENGTH(fZeroDigit);
                        }

                        // 1. Require at least one mantissa pattern digit
                        // 2. Disallow "#+ @" in mantissa
                        // 3. Require at least one exponent pattern digit
                        if (((digitLeftCount + zeroDigitCount) < 1 &&
                             (sigDigitCount + digitRightCount) < 1) ||
                            (sigDigitCount > 0 && digitLeftCount > 0) ||
                            expDigits < 1) {
                            // Malformed exponential pattern
                            debug("Malformed exponential pattern")
                            status = U_MALFORMED_EXPONENTIAL_PATTERN;
                            syntaxError(pattern,pos,parseError);
                            return;
                        }
                    }
                    // Transition to suffix subpart
                    subpart = 2; // suffix subpart
                    affix = &suffix;
                    sub0Limit = pos;
                    continue;
                }
                break;
            case 1: // Prefix subpart
            case 2: // Suffix subpart
                // Process the prefix / suffix characters
                // Process unquoted characters seen in prefix or suffix
                // subpart.

                // Several syntax characters implicitly begins the
                // next subpart if we are in the prefix; otherwise
                // they are illegal if unquoted.
                if (!pattern.compare(pos, digitLen, fDigit) ||
                    !pattern.compare(pos, groupSepLen, fGroupingSeparator) ||
                    !pattern.compare(pos, decimalSepLen, fDecimalSeparator) ||
                    (ch >= fZeroDigit && ch <= nineDigit) ||
                    ch == fSigDigit) {
                    if (subpart == 1) { // prefix subpart
                        subpart = 0; // pattern proper subpart
                        sub0Start = pos; // Reprocess this character
                        continue;
                    } else {
                        status = U_UNQUOTED_SPECIAL;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                } else if (ch == kCurrencySign) {
                    affix->append(kQuote); // Encode currency
                    // Use lookahead to determine if the currency sign is
                    // doubled or not.
                    U_ASSERT(U16_LENGTH(kCurrencySign) == 1);
                    if ((pos+1) < pattern.length() && pattern[pos+1] == kCurrencySign) {
                        affix->append(kCurrencySign);
                        ++pos; // Skip over the doubled character
                        if ((pos+1) < pattern.length() &&
                            pattern[pos+1] == kCurrencySign) {
                            affix->append(kCurrencySign);
                            ++pos; // Skip over the doubled character
                            out.fCurrencySignCount = fgCurrencySignCountInPluralFormat;
                        } else {
                            out.fCurrencySignCount = fgCurrencySignCountInISOFormat;
                        }
                    } else {
                        out.fCurrencySignCount = fgCurrencySignCountInSymbolFormat;
                    }
                    // Fall through to append(ch)
                } else if (ch == kQuote) {
                    // A quote outside quotes indicates either the opening
                    // quote or two quotes, which is a quote literal.  That is,
                    // we have the first quote in 'do' or o''clock.
                    U_ASSERT(U16_LENGTH(kQuote) == 1);
                    ++pos;
                    if (pos < pattern.length() && pattern[pos] == kQuote) {
                        affix->append(kQuote); // Encode quote
                        // Fall through to append(ch)
                    } else {
                        subpart += 2; // open quote
                        continue;
                    }
                } else if (pattern.compare(pos, fSeparator.length(), fSeparator) == 0) {
                    // Don't allow separators in the prefix, and don't allow
                    // separators in the second pattern (part == 1).
                    if (subpart == 1 || part == 1) {
                        // Unexpected separator
                        debug("Unexpected separator")
                        status = U_UNEXPECTED_TOKEN;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    sub2Limit = pos;
                    isPartDone = TRUE; // Go to next part
                    pos += fSeparator.length();
                    break;
                } else if (pattern.compare(pos, fPercent.length(), fPercent) == 0) {
                    // Next handle characters which are appended directly.
                    if (multiplier != 1) {
                        // Too many percent/perMill characters
                        debug("Too many percent characters")
                        status = U_MULTIPLE_PERCENT_SYMBOLS;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    affix->append(kQuote); // Encode percent/perMill
                    affix->append(kPatternPercent); // Use unlocalized pattern char
                    multiplier = 100;
                    pos += fPercent.length();
                    break;
                } else if (pattern.compare(pos, fPerMill.length(), fPerMill) == 0) {
                    // Next handle characters which are appended directly.
                    if (multiplier != 1) {
                        // Too many percent/perMill characters
                        debug("Too many perMill characters")
                        status = U_MULTIPLE_PERMILL_SYMBOLS;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    affix->append(kQuote); // Encode percent/perMill
                    affix->append(kPatternPerMill); // Use unlocalized pattern char
                    multiplier = 1000;
                    pos += fPerMill.length();
                    break;
                } else if (pattern.compare(pos, fPadEscape.length(), fPadEscape) == 0) {
                    if (padPos >= 0 ||               // Multiple pad specifiers
                        (pos+1) == pattern.length()) { // Nothing after padEscape
                        debug("Multiple pad specifiers")
                        status = U_MULTIPLE_PAD_SPECIFIERS;
                        syntaxError(pattern,pos,parseError);
                        return;
                    }
                    padPos = pos;
                    pos += fPadEscape.length();
                    padChar = pattern.char32At(pos);
                    pos += U16_LENGTH(padChar);
                    break;
                } else if (pattern.compare(pos, fMinus.length(), fMinus) == 0) {
                    affix->append(kQuote); // Encode minus
                    affix->append(kPatternMinus);
                    pos += fMinus.length();
                    break;
                } else if (pattern.compare(pos, fPlus.length(), fPlus) == 0) {
                    affix->append(kQuote); // Encode plus
                    affix->append(kPatternPlus);
                    pos += fPlus.length();
                    break;
                }
                // Unquoted, non-special characters fall through to here, as
                // well as other code which needs to append something to the
                // affix.
                affix->append(ch);
                pos += U16_LENGTH(ch);
                break;
            case 3: // Prefix subpart, in quote
            case 4: // Suffix subpart, in quote
                // A quote within quotes indicates either the closing
                // quote or two quotes, which is a quote literal.  That is,
                // we have the second quote in 'do' or 'don''t'.
                if (ch == kQuote) {
                    ++pos;
                    if (pos < pattern.length() && pattern[pos] == kQuote) {
                        affix->append(kQuote); // Encode quote
                        // Fall through to append(ch)
                    } else {
                        subpart -= 2; // close quote
                        continue;
                    }
                }
                affix->append(ch);
                pos += U16_LENGTH(ch);
                break;
            }
        }

        if (sub0Limit == 0) {
            sub0Limit = pattern.length();
        }

        if (sub2Limit == 0) {
            sub2Limit = pattern.length();
        }

        /* Handle patterns with no '0' pattern character.  These patterns
         * are legal, but must be recodified to make sense.  "##.###" ->
         * "#0.###".  ".###" -> ".0##".
         *
         * We allow patterns of the form "####" to produce a zeroDigitCount
         * of zero (got that?); although this seems like it might make it
         * possible for format() to produce empty strings, format() checks
         * for this condition and outputs a zero digit in this situation.
         * Having a zeroDigitCount of zero yields a minimum integer digits
         * of zero, which allows proper round-trip patterns.  We don't want
         * "#" to become "#0" when toPattern() is called (even though that's
         * what it really is, semantically).
         */
        if (zeroDigitCount == 0 && sigDigitCount == 0 &&
            digitLeftCount > 0 && decimalPos >= 0) {
            // Handle "###.###" and "###." and ".###"
            int n = decimalPos;
            if (n == 0)
                ++n; // Handle ".###"
            digitRightCount = digitLeftCount - n;
            digitLeftCount = n - 1;
            zeroDigitCount = 1;
        }

        // Do syntax checking on the digits, decimal points, and quotes.
        if ((decimalPos < 0 && digitRightCount > 0 && sigDigitCount == 0) ||
            (decimalPos >= 0 &&
             (sigDigitCount > 0 ||
              decimalPos < digitLeftCount ||
              decimalPos > (digitLeftCount + zeroDigitCount))) ||
            groupingCount == 0 || groupingCount2 == 0 ||
            (sigDigitCount > 0 && zeroDigitCount > 0) ||
            subpart > 2)
        { // subpart > 2 == unmatched quote
            debug("Syntax error")
            status = U_PATTERN_SYNTAX_ERROR;
            syntaxError(pattern,pos,parseError);
            return;
        }

        // Make sure pad is at legal position before or after affix.
        if (padPos >= 0) {
            if (padPos == start) {
                padPos = DecimalFormatPattern::kPadBeforePrefix;
            } else if (padPos+2 == sub0Start) {
                padPos = DecimalFormatPattern::kPadAfterPrefix;
            } else if (padPos == sub0Limit) {
                padPos = DecimalFormatPattern::kPadBeforeSuffix;
            } else if (padPos+2 == sub2Limit) {
                padPos = DecimalFormatPattern::kPadAfterSuffix;
            } else {
                // Illegal pad position
                debug("Illegal pad position")
                status = U_ILLEGAL_PAD_POSITION;
                syntaxError(pattern,pos,parseError);
                return;
            }
        }

        if (part == 0) {
            out.fPosPatternsBogus = FALSE;
            out.fPosPrefixPattern = prefix;
            out.fPosSuffixPattern = suffix;
            out.fNegPatternsBogus = TRUE;
            out.fNegPrefixPattern.remove();
            out.fNegSuffixPattern.remove();

            out.fUseExponentialNotation = (expDigits >= 0);
            if (out.fUseExponentialNotation) {
              out.fMinExponentDigits = expDigits;
            }
            out.fExponentSignAlwaysShown = expSignAlways;
            int32_t digitTotalCount = digitLeftCount + zeroDigitCount + digitRightCount;
            // The effectiveDecimalPos is the position the decimal is at or
            // would be at if there is no decimal.  Note that if
            // decimalPos<0, then digitTotalCount == digitLeftCount +
            // zeroDigitCount.
            int32_t effectiveDecimalPos = decimalPos >= 0 ? decimalPos : digitTotalCount;
            UBool isSigDig = (sigDigitCount > 0);
            out.fUseSignificantDigits = isSigDig;
            if (isSigDig) {
                out.fMinimumSignificantDigits = sigDigitCount;
                out.fMaximumSignificantDigits = sigDigitCount + digitRightCount;
            } else {
                int32_t minInt = effectiveDecimalPos - digitLeftCount;
                out.fMinimumIntegerDigits = minInt;
                out.fMaximumIntegerDigits = out.fUseExponentialNotation
                    ? digitLeftCount + out.fMinimumIntegerDigits
                    : gDefaultMaxIntegerDigits;
                out.fMaximumFractionDigits = decimalPos >= 0
                    ? (digitTotalCount - decimalPos) : 0;
                out.fMinimumFractionDigits = decimalPos >= 0
                    ? (digitLeftCount + zeroDigitCount - decimalPos) : 0;
            }
            out.fGroupingUsed = groupingCount > 0;
            out.fGroupingSize = (groupingCount > 0) ? groupingCount : 0;
            out.fGroupingSize2 = (groupingCount2 > 0 && groupingCount2 != groupingCount)
                ? groupingCount2 : 0;
            out.fMultiplier = multiplier;
            out.fDecimalSeparatorAlwaysShown = decimalPos == 0
                    || decimalPos == digitTotalCount;
            if (padPos >= 0) {
                out.fPadPosition = (DecimalFormatPattern::EPadPosition) padPos;
                // To compute the format width, first set up sub0Limit -
                // sub0Start.  Add in prefix/suffix length later.

                // fFormatWidth = prefix.length() + suffix.length() +
                //    sub0Limit - sub0Start;
                out.fFormatWidth = sub0Limit - sub0Start;
                out.fPad = padChar;
            } else {
                out.fFormatWidth = 0;
            }
            if (roundingPos >= 0) {
                out.fRoundingIncrementUsed = TRUE;
                roundingInc.setDecimalAt(effectiveDecimalPos - roundingPos);
                out.fRoundingIncrement = roundingInc;
            } else {
                out.fRoundingIncrementUsed = FALSE;
            }
        } else {
            out.fNegPatternsBogus = FALSE;
            out.fNegPrefixPattern = prefix;
            out.fNegSuffixPattern = suffix;
        }
    }

    if (pattern.length() == 0) {
        out.fNegPatternsBogus = TRUE;
        out.fNegPrefixPattern.remove();
        out.fNegSuffixPattern.remove();
        out.fPosPatternsBogus = FALSE;
        out.fPosPrefixPattern.remove();
        out.fPosSuffixPattern.remove();

        out.fMinimumIntegerDigits = 0;
        out.fMaximumIntegerDigits = kDoubleIntegerDigits;
        out.fMinimumFractionDigits = 0;
        out.fMaximumFractionDigits = kDoubleFractionDigits;

        out.fUseExponentialNotation = FALSE;
        out.fCurrencySignCount = fgCurrencySignCountZero;
        out.fGroupingUsed = FALSE;
        out.fGroupingSize = 0;
        out.fGroupingSize2 = 0;
        out.fMultiplier = 1;
        out.fDecimalSeparatorAlwaysShown = FALSE;
        out.fFormatWidth = 0;
        out.fRoundingIncrementUsed = FALSE;
    }

    // If there was no negative pattern, or if the negative pattern is
    // identical to the positive pattern, then prepend the minus sign to the
    // positive pattern to form the negative pattern.
    if (out.fNegPatternsBogus ||
        (out.fNegPrefixPattern == out.fPosPrefixPattern
         && out.fNegSuffixPattern == out.fPosSuffixPattern)) {
        out.fNegPatternsBogus = FALSE;
        out.fNegSuffixPattern = out.fPosSuffixPattern;
        out.fNegPrefixPattern.remove();
        out.fNegPrefixPattern.append(kQuote).append(kPatternMinus)
            .append(out.fPosPrefixPattern);
    }
    // TODO: Deprecate/Remove out.fNegSuffixPattern and 3 other fields.
    AffixPattern::parseAffixString( 
            out.fNegSuffixPattern, out.fNegSuffixAffix, status);
    AffixPattern::parseAffixString(
            out.fPosSuffixPattern, out.fPosSuffixAffix, status);
    AffixPattern::parseAffixString(
            out.fNegPrefixPattern, out.fNegPrefixAffix, status);
    AffixPattern::parseAffixString(
            out.fPosPrefixPattern, out.fPosPrefixAffix, status);
}
예제 #30
0
/**
 * Dumb recursive implementation of permutation.
 * TODO: optimize
 * @param source the string to find permutations for
 * @return the results in a set.
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
    int32_t i = 0;

    // optimization:
    // if zero or one character, just return a set with it
    // we check for length < 2 to keep from counting code points all the time
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        /* test for NULL */
        if (toPut == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    UChar32 cp;
    Hashtable subpermute(status);
    if(U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uprv_deleteUObject);

    for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {
        cp = source.char32At(i);
        const UHashElement *ne = NULL;
        int32_t el = UHASH_FIRST;
        UnicodeString subPermuteString = source;

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
            continue;
        }

        subpermute.removeAll();

        // see what the permutations of the characters before and after this one are
        //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
        permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
        /* Test for buffer overflows */
        if(U_FAILURE(status)) {
            return;
        }
        // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 
        // of source at this point.

        // prefix this character to all of them
        ne = subpermute.nextElement(el);
        while (ne != NULL) {
            UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            //test for  NULL
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
            //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr));
            result->put(*chStr, chStr, status);
            ne = subpermute.nextElement(el);
        }
    }
    //return result;
}