Exemplo n.º 1
0
// This is mostly a port of the code in WebCore/editing/SmartReplaceCF.cpp
// except we use icu in place of CoreFoundations character classes.
static USet* getSmartSet(bool isPreviousCharacter)
{
    static USet* preSmartSet = nullptr;
    static USet* postSmartSet = nullptr;
    USet* smartSet = isPreviousCharacter ? preSmartSet : postSmartSet;
    if (!smartSet) {
        // Whitespace and newline (kCFCharacterSetWhitespaceAndNewline)
        UErrorCode ec = U_ZERO_ERROR;
        String whitespaceAndNewline("[[:WSpace:] [\\u000A\\u000B\\u000C\\u000D\\u0085]]");
        smartSet = uset_openPattern(whitespaceAndNewline.charactersWithNullTermination().data(), whitespaceAndNewline.length(), &ec);
        ASSERT(U_SUCCESS(ec));

        // CJK ranges
        uset_addRange(smartSet, 0x1100, 0x1100 + 256); // Hangul Jamo (0x1100 - 0x11FF)
        uset_addRange(smartSet, 0x2E80, 0x2E80 + 352); // CJK & Kangxi Radicals (0x2E80 - 0x2FDF)
        uset_addRange(smartSet, 0x2FF0, 0x2FF0 + 464); // Ideograph Descriptions, CJK Symbols, Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, & Bopomofo Ext (0x2FF0 - 0x31BF)
        uset_addRange(smartSet, 0x3200, 0x3200 + 29392); // Enclosed CJK, CJK Ideographs (Uni Han & Ext A), & Yi (0x3200 - 0xA4CF)
        uset_addRange(smartSet, 0xAC00, 0xAC00 + 11183); // Hangul Syllables (0xAC00 - 0xD7AF)
        uset_addRange(smartSet, 0xF900, 0xF900 + 352); // CJK Compatibility Ideographs (0xF900 - 0xFA5F)
        uset_addRange(smartSet, 0xFE30, 0xFE30 + 32); // CJK Compatibility From (0xFE30 - 0xFE4F)
        uset_addRange(smartSet, 0xFF00, 0xFF00 + 240); // Half/Full Width Form (0xFF00 - 0xFFEF)
        uset_addRange(smartSet, 0x20000, 0x20000 + 0xA6D7); // CJK Ideograph Exntension B
        uset_addRange(smartSet, 0x2F800, 0x2F800 + 0x021E); // CJK Compatibility Ideographs (0x2F800 - 0x2FA1D)

        if (isPreviousCharacter) {
            addAllCodePoints(smartSet, "([\"\'#$/-`{");
            preSmartSet = smartSet;
        } else {
            addAllCodePoints(smartSet, ")].,;:?\'!\"%*-/}");

            // Punctuation (kCFCharacterSetPunctuation)
            UErrorCode ec = U_ZERO_ERROR;
            String punctuationClass("[:P:]");
            USet* icuPunct = uset_openPattern(punctuationClass.charactersWithNullTermination().data(), punctuationClass.length(), &ec);
            ASSERT(U_SUCCESS(ec));
            uset_addAll(smartSet, icuPunct);
            uset_close(icuPunct);

            postSmartSet = smartSet;
        }
    }
    return smartSet;
}
Exemplo n.º 2
0
static void TestFreezable() {
    USet *idSet;
    USet *frozen;
    USet *thawed;

    idSet=openIDSet();

    if (idSet == NULL) {
        log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
        uset_close(idSet);
        return;
    }

    frozen=uset_clone(idSet);

    if (frozen == NULL) {
        log_err("uset_Clone() returned NULL\n");
        return;
    }

    if(!uset_equals(frozen, idSet)) {
        log_err("uset_clone() did not make an equal copy\n");
    }

    uset_freeze(frozen);
    uset_addRange(frozen, 0xd802, 0xd805);

    if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
        log_err("uset_freeze() or uset_isFrozen() does not work\n");
    }

    thawed=uset_cloneAsThawed(frozen);

    if (thawed == NULL) {
        log_err("uset_cloneAsThawed(frozen) returned NULL");
        uset_close(frozen);
        uset_close(idSet);
        return;
    }

    uset_addRange(thawed, 0xd802, 0xd805);

    if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
        log_err("uset_cloneAsThawed() does not work\n");
    }

    uset_close(idSet);
    uset_close(frozen);
    uset_close(thawed);
}
Exemplo n.º 3
0
static void generateSelectorData(UConverterSelector* result,
                                 UPropsVectors *upvec,
                                 const USet* excludedCodePoints,
                                 const UConverterUnicodeSet whichSet,
                                 UErrorCode* status) {
  if (U_FAILURE(*status)) {
    return;
  }

  int32_t columns = (result->encodingsCount+31)/32;

  // set errorValue to all-ones
  for (int32_t col = 0; col < columns; col++) {
    upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
                   col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status);
  }

  for (int32_t i = 0; i < result->encodingsCount; ++i) {
    uint32_t mask;
    uint32_t column;
    int32_t item_count;
    int32_t j;
    UConverter* test_converter = ucnv_open(result->encodings[i], status);
    if (U_FAILURE(*status)) {
      return;
    }
    USet* unicode_point_set;
    unicode_point_set = uset_open(1, 0);  // empty set

    ucnv_getUnicodeSet(test_converter, unicode_point_set,
                       whichSet, status);
    if (U_FAILURE(*status)) {
      ucnv_close(test_converter);
      return;
    }

    column = i / 32;
    mask = 1 << (i%32);
    // now iterate over intervals on set i!
    item_count = uset_getItemCount(unicode_point_set);

    for (j = 0; j < item_count; ++j) {
      UChar32 start_char;
      UChar32 end_char;
      UErrorCode smallStatus = U_ZERO_ERROR;
      uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
                   &smallStatus);
      if (U_FAILURE(smallStatus)) {
        // this will be reached for the converters that fill the set with
        // strings. Those should be ignored by our system
      } else {
        upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask,
                       status);
      }
    }
    ucnv_close(test_converter);
    uset_close(unicode_point_set);
    if (U_FAILURE(*status)) {
      return;
    }
  }

  // handle excluded encodings! Simply set their values to all 1's in the upvec
  if (excludedCodePoints) {
    int32_t item_count = uset_getItemCount(excludedCodePoints);
    for (int32_t j = 0; j < item_count; ++j) {
      UChar32 start_char;
      UChar32 end_char;

      uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
                   status);
      for (int32_t col = 0; col < columns; col++) {
        upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0),
                      status);
      }
    }
  }

  // alright. Now, let's put things in the same exact form you'd get when you
  // unserialize things.
  result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
  result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
  result->pvCount *= columns;  // number of uint32_t = rows * columns
  result->ownPv = TRUE;
}
Exemplo n.º 4
0
void SSearchTest::monkeyTest(char *params)
{
    // ook!
    UErrorCode status = U_ZERO_ERROR;
  //UCollator *coll = ucol_open(NULL, &status);
    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);

    if (U_FAILURE(status)) {
        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
        return;
    }

    CollData  *monkeyData = new CollData(coll, status);

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    USet *letters = uset_openPattern(letter_pattern, 39, &status);
    SetMonkey letterMonkey(letters);
    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
    UnicodeString testCase;
    UnicodeString alternate;
    UnicodeString pattern, altPattern;
    UnicodeString prefix, altPrefix;
    UnicodeString suffix, altSuffix;

    Monkey *monkeys[] = {
        &letterMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey};
    int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
    // int32_t nonMatchCount = 0;

    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
    int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
    int32_t loopCount = quick? 1000 : 10000;
    int32_t firstStrength = 0;
    int32_t lastStrength  = strengthCount - 1; //*/ 0;

    if (params != NULL) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
        UnicodeString p(params);

        loopCount = getIntParam("loop", p, loopCount);
        m_seed    = getIntParam("seed", p, m_seed);

        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
        if (m.find()) {
            UnicodeString breakType = m.group(1, status);

            for (int32_t s = 0; s < strengthCount; s += 1) {
                if (breakType == strengthNames[s]) {
                    firstStrength = lastStrength = s;
                    break;
                }
            }

            m.reset();
            p = m.replaceFirst("", status);
        }

        if (RegexMatcher("\\S", p, 0, status).find()) {
            // Each option is stripped out of the option string as it is processed.
            // All options have been checked.  The option string should have been completely emptied..
            char buf[100];
            p.extract(buf, sizeof(buf), NULL, status);
            buf[sizeof(buf)-1] = 0;
            errln("Unrecognized or extra parameter:  %s\n", buf);
            return;
        }
#else
        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
#endif
    }

    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
        int32_t notFoundCount = 0;

        logln("Setting strength to %s.", strengthNames[s]);
        ucol_setStrength(coll, strengths[s]);

        // TODO: try alternate prefix and suffix too?
        // TODO: alterntaes are only equal at primary strength. Is this OK?
        for(int32_t t = 0; t < loopCount; t += 1) {
            uint32_t seed = m_seed;
            // int32_t  nmc = 0;

            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);

            // pattern
            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);

            testCase.remove();
            testCase.append(prefix);
            testCase.append(/*alt*/pattern);

            // prefix + pattern
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);

            testCase.append(suffix);

            // prefix + pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);

            testCase.remove();
            testCase.append(pattern);
            testCase.append(suffix);

            // pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
        }

       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
    }

    uset_close(contractions);
    uset_close(expansions);
    uset_close(letters);
    delete monkeyData;

    ucol_close(coll);
}
Exemplo n.º 5
0
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characers we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
     UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
     CEList hanList(coll, hanString, status);
     CEList jamoList(coll, jamoString, status);
     int32_t j = 0;

     if (U_FAILURE(status)) {
         return;
     }

     for (int32_t c = 0; c < jamoList.size(); c += 1) {
         uint32_t jce = jamoList[c];

         if (! isContinuation(jce)) {
             jamoLimits[j++] = jce;
         }
     }

     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

     minHan = 0xFFFFFFFF;
     maxHan = 0;

     for(int32_t h = 0; h < hanList.size(); h += 2) {
         uint32_t han = (uint32_t) hanList[h];

         if (han < minHan) {
             minHan = han;
         }

         if (han > maxHan) {
             maxHan = han;
         }
     }

     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
Exemplo n.º 6
0
static void TestUSpoofCAPI(void) {

    /*
     *  basic uspoof_open().
     */
    {
        USpoofChecker *sc;
        UErrorCode  status = U_ZERO_ERROR;
        sc = uspoof_open(&status);
        TEST_ASSERT_SUCCESS(status);
        if (U_FAILURE(status)) {
            /* If things are so broken that we can't even open a default spoof checker,  */
            /*   don't even try the rest of the tests.  They would all fail.             */
            return;
        }
        uspoof_close(sc);
    }

    
        
    /*
     *  Test Open from source rules.
    */
    TEST_SETUP
    const char *dataSrcDir;
    char       *fileName;
    char       *confusables;
    int         confusablesLength;
    char       *confusablesWholeScript;
    int         confusablesWholeScriptLength;
    FILE       *f;
    UParseError pe;
    int32_t     errType;
    USpoofChecker *rsc;
    
    dataSrcDir = ctest_dataSrcDir();
    fileName = malloc(strlen(dataSrcDir) + 100);
    strcpy(fileName, dataSrcDir);
    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
    f = fopen(fileName, "r");
    TEST_ASSERT_NE(f, NULL);
    confusables = malloc(3000000);
    confusablesLength = fread(confusables, 1, 3000000, f);
    fclose(f);

    
    strcpy(fileName, dataSrcDir);
    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
    f = fopen(fileName, "r");
    TEST_ASSERT_NE(f, NULL);
    confusablesWholeScript = malloc(1000000);
    confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f);
    fclose(f);

    rsc = uspoof_openFromSource(confusables, confusablesLength,
                                              confusablesWholeScript, confusablesWholeScriptLength,
                                              &errType, &pe, &status);
    TEST_ASSERT_SUCCESS(status);

    free(confusablesWholeScript);
    free(confusables);
    free(fileName);
    uspoof_close(rsc);
    /*  printf("ParseError Line is %d\n", pe.line);  */
    TEST_TEARDOWN;


    /*
     * openFromSerialized and serialize
    */
    TEST_SETUP
        int32_t        serializedSize = 0;
        int32_t        actualLength = 0;
        char           *buf;
        USpoofChecker  *sc2;
        int32_t         checkResults;

        
        serializedSize = uspoof_serialize(sc, NULL, 0, &status);
        TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(serializedSize > 0);

        /* Serialize the default spoof checker */
        status = U_ZERO_ERROR;
        buf = (char *)malloc(serializedSize + 10);
        TEST_ASSERT(buf != NULL);
        buf[serializedSize] = 42;
        uspoof_serialize(sc, buf, serializedSize, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(42, buf[serializedSize]);

        /* Create a new spoof checker from the freshly serialized data */
        sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_NE(NULL, sc2);
        TEST_ASSERT_EQ(serializedSize, actualLength);

        /* Verify that the new spoof checker at least wiggles */
        checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);

        checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

        uspoof_close(sc2);
        free(buf);
    TEST_TEARDOWN;
        
        
        
    /*
     * Set & Get Check Flags
    */
    TEST_SETUP
        int32_t t;
        uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status);
        TEST_ASSERT_SUCCESS(status);
        t = uspoof_getChecks(sc, &status);
        TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS);
    
        uspoof_setChecks(sc, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        t = uspoof_getChecks(sc, &status);
        TEST_ASSERT_EQ(0, t);
        
        uspoof_setChecks(sc,
                        USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
                        &status);
        TEST_ASSERT_SUCCESS(status);
        t = uspoof_getChecks(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t);
    TEST_TEARDOWN;

    /*
    * get & setAllowedChars
    */
    TEST_SETUP
        USet *us;
        const USet *uset;

        uset = uspoof_getAllowedChars(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(uset_isFrozen(uset));
        us = uset_open((UChar32)0x41, (UChar32)0x5A);   /*  [A-Z]  */
        uspoof_setAllowedChars(sc, us, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status));
        TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status)));
        TEST_ASSERT_SUCCESS(status);
        uset_close(us);
    TEST_TEARDOWN;

    /*
    *  clone()
    */

    TEST_SETUP
        USpoofChecker *clone1 = NULL;
        USpoofChecker *clone2 = NULL;
        int32_t        checkResults = 0;
        
        clone1 = uspoof_clone(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_NE(clone1, sc);

        clone2 = uspoof_clone(clone1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_NE(clone2, clone1);

        uspoof_close(clone1);
        
        /* Verify that the cloned spoof checker is alive */
        checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);

        checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
        uspoof_close(clone2);
    TEST_TEARDOWN;

    /*
     *  get & set Checks
    */
    TEST_SETUP
        int32_t   checks;
        int32_t   checks2;
        int32_t   checkResults;

        checks = uspoof_getChecks(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks);

        checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE);
        uspoof_setChecks(sc, checks, &status);
        TEST_ASSERT_SUCCESS(status);
        checks2 = uspoof_getChecks(sc, &status);
        TEST_ASSERT_EQ(checks, checks2);

        /* The checks that were disabled just above are the same ones that the "scMixed" test fails.
            So with those tests gone checking that Identifier should now succeed */
        checkResults = uspoof_check(sc, scMixed, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
    TEST_TEARDOWN;
        
    /*
     *  AllowedLoacles
     */

    TEST_SETUP
        const char  *allowedLocales;
        int32_t  checkResults;

        /* Default allowed locales list should be empty */
        allowedLocales = uspoof_getAllowedLocales(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(strcmp("", allowedLocales) == 0)

        /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
        uspoof_setAllowedLocales(sc, "en, ru_RU", &status);
        TEST_ASSERT_SUCCESS(status);
        allowedLocales = uspoof_getAllowedLocales(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(strstr(allowedLocales, "en") != NULL);
        TEST_ASSERT(strstr(allowedLocales, "ru") != NULL);

        /* Limit checks to USPOOF_CHAR_LIMIT.  Some of the test data has whole script confusables also,
         * which we don't want to see in this test. */
        uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status);
        TEST_ASSERT_SUCCESS(status);

        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
        
        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);

        checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);

        /* Reset with an empty locale list, which should allow all characters to pass */
        uspoof_setAllowedLocales(sc, " ", &status);
        TEST_ASSERT_SUCCESS(status);

        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
    TEST_TEARDOWN;

    /*
     * AllowedChars   set/get the USet of allowed characters.
     */
    TEST_SETUP
        const USet  *set;
        USet        *tmpSet;
        int32_t      checkResults;
        
        /* By default, we should see no restriction; the USet should allow all characters. */
        set = uspoof_getAllowedChars(sc, &status);
        TEST_ASSERT_SUCCESS(status);
        tmpSet = uset_open(0, 0x10ffff);
        TEST_ASSERT(uset_equals(tmpSet, set));

        /* Setting the allowed chars should enable the check. */
        uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status);
        TEST_ASSERT_SUCCESS(status);

        /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
        uset_remove(tmpSet, goodLatin[1]);
        uspoof_setAllowedChars(sc, tmpSet, &status);
        TEST_ASSERT_SUCCESS(status);
        uset_close(tmpSet);

        /* Latin Identifier should now fail; other non-latin test cases should still be OK */
        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);

        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
    TEST_TEARDOWN;

    /*
     * check UTF-8
     */
    TEST_SETUP
        char    utf8buf[200];
        int32_t checkResults;
        int32_t position;

        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        position = 666;
        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
        TEST_ASSERT_EQ(666, position);

        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);

        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        position = 666;
        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
        TEST_ASSERT_EQ(2, position);

    TEST_TEARDOWN;

    /*
     * uspoof_areConfusable()
     */
    TEST_SETUP
        int32_t  checkResults;
        
        checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

        checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);

        checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

    TEST_TEARDOWN;

    /*
     * areConfusableUTF8
     */
    TEST_SETUP
        int32_t checkResults;
        char s1[200];
        char s2[200];


        u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
        u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

        u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
        u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
        
        u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
        u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

    TEST_TEARDOWN;


  /*
   * getSkeleton
   */

    TEST_SETUP
        UChar dest[100];
        int32_t   skelLength;

        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, sizeof(dest)/sizeof(UChar), &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
        TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);

        skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, dest, sizeof(dest)/sizeof(UChar), &status);
        TEST_ASSERT_SUCCESS(status);

        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status);
        TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
        TEST_ASSERT_EQ(3, skelLength);
        status = U_ZERO_ERROR;

    TEST_TEARDOWN;
}
Exemplo n.º 7
0
static void TestSelector()
{
  TestText text;
  USet* excluded_sets[3] = { NULL };
  int32_t i, testCaseIdx;

  if (!getAvailableNames()) {
    return;
  }
  if (!text_open(&text)) {
    releaseAvailableNames();;
  }

  excluded_sets[0] = uset_openEmpty();
  for(i = 1 ; i < 3 ; i++) {
    excluded_sets[i] = uset_open(i*30, i*30+500);
  }

  for(testCaseIdx = 0; testCaseIdx < UPRV_LENGTHOF(getEncodingsFns); testCaseIdx++)
  {
    int32_t excluded_set_id;
    int32_t num_encodings;
    const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings);
    if (getTestOption(QUICK_OPTION) && num_encodings > 25) {
      uprv_free((void *)encodings);
      continue;
    }

    /*
     * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
     *
     * This loop was replaced by the following statement because
     * the loop made the test run longer without adding to the code coverage.
     * The handling of the exclusion set is independent of the
     * set of encodings, so there is no need to test every combination.
     */
    excluded_set_id = testCaseIdx % UPRV_LENGTHOF(excluded_sets);
    {
      UConverterSelector *sel_rt, *sel_fb;
      char *buffer_fb = NULL;
      UErrorCode status = U_ZERO_ERROR;
      sel_rt = ucnvsel_open(encodings, num_encodings,
                            excluded_sets[excluded_set_id],
                            UCNV_ROUNDTRIP_SET, &status);
      if (num_encodings == gCountAvailable) {
        /* test the special "all converters" parameter values */
        sel_fb = ucnvsel_open(NULL, 0,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else if (uset_isEmpty(excluded_sets[excluded_set_id])) {
        /* test that a NULL set gives the same results as an empty set */
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              NULL,
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else {
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      }
      if (U_FAILURE(status)) {
        log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status));
        ucnvsel_close(sel_rt);
        uprv_free((void *)encodings);
        continue;
      }

      text_reset(&text);
      for (;;) {
        UBool *manual_rt, *manual_fb;
        static UChar utf16[10000];
        char *s;
        int32_t length8, length16;

        s = text_nextString(&text, &length8);
        if (s == NULL || (getTestOption(QUICK_OPTION) && text.number > 3)) {
          break;
        }

        manual_rt = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_SET);
        manual_fb = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_AND_FALLBACK_SET);
        /* UTF-8 with length */
        status = U_ZERO_ERROR;
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb);
        /* UTF-8 NUL-terminated */
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb);

        u_strFromUTF8(utf16, UPRV_LENGTHOF(utf16), &length16, s, length8, &status);
        if (U_FAILURE(status)) {
          log_err("error converting the test text (string %ld) to UTF-16 - %s\n",
                  (long)text.number, u_errorName(status));
        } else {
          if (text.number == 0) {
            sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status);
          }
          if (U_SUCCESS(status)) {
            /* UTF-16 with length */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb);
            /* UTF-16 NUL-terminated */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb);
          }
        }

        uprv_free(manual_rt);
        uprv_free(manual_fb);
      }
      ucnvsel_close(sel_rt);
      ucnvsel_close(sel_fb);
      uprv_free(buffer_fb);
    }
    uprv_free((void *)encodings);
  }

  releaseAvailableNames();
  text_close(&text);
  for(i = 0 ; i < 3 ; i++) {
    uset_close(excluded_sets[i]);
  }
}
Exemplo n.º 8
0
/**
 * Basic API test for uset.x
 */
static void TestAPI() {
    USet* set;
    USet* set2;
    UErrorCode ec;
    
    /* [] */
    set = uset_openEmpty();
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 0);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 1);
    uset_clear(set);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    /* [ABC] */
    set = uset_open(0x0041, 0x0043);
    expect(set, "ABC", "DEF{ab}", NULL);
    uset_close(set);

    /* [a-c{ab}] */
    ec = U_ZERO_ERROR;
    set = uset_openPattern(PAT, PAT_LEN, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
        return;
    }
    if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
        log_err("uset_resemblesPattern of PAT failed\n");
    }
    expect(set, "abc{ab}", "def{bc}", &ec);

    /* [a-d{ab}] */
    uset_add(set, 0x64);
    expect(set, "abcd{ab}", "ef{bc}", NULL);

    /* [acd{ab}{bc}] */
    uset_remove(set, 0x62);
    uset_addString(set, STR_bc, STR_bc_LEN);
    expect(set, "acd{ab}{bc}", "bef{cd}", NULL);

    /* [acd{bc}] */
    uset_removeString(set, STR_ab, STR_ab_LEN);
    expect(set, "acd{bc}", "bfg{ab}", NULL);

    /* [^acd{bc}] */
    uset_complement(set);
    expect(set, "bef{bc}", "acd{ac}", NULL);

    /* [a-e{bc}] */
    uset_complement(set);
    uset_addRange(set, 0x0062, 0x0065);
    expect(set, "abcde{bc}", "fg{ab}", NULL);

    /* [de{bc}] */
    uset_removeRange(set, 0x0050, 0x0063);
    expect(set, "de{bc}", "bcfg{ab}", NULL);

    /* [g-l] */
    uset_set(set, 0x0067, 0x006C);
    expect(set, "ghijkl", "de{bc}", NULL);

    if (uset_indexOf(set, 0x0067) != 0) {
        log_err("uset_indexOf failed finding correct index of 'g'\n");
    }

    if (uset_charAt(set, 0) != 0x0067) {
        log_err("uset_charAt failed finding correct char 'g' at index 0\n");
    }

    /* How to test this one...? */
    uset_compact(set);

    /* [g-i] */
    uset_retain(set, 0x0067, 0x0069);
    expect(set, "ghi", "dejkl{bc}", NULL);

    /* UCHAR_ASCII_HEX_DIGIT */
    uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
        return;
    }
    expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);

    /* [ab] */
    uset_clear(set);
    uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
    expect(set, "ab", "def{ab}", NULL);
    if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
        log_err("set should not conatin all characters of \"bc\" \n");
    }

    /* [] */
    set2 = uset_open(1, 1);
    uset_clear(set2);

    /* space */
    uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
    expect(set2, " ", "abcdefghi{bc}", NULL);

    /* [a-c] */
    uset_set(set2, 0x0061, 0x0063);
    /* [g-i] */
    uset_set(set, 0x0067, 0x0069);

    /* [a-c g-i] */
    if (uset_containsSome(set, set2)) {
        log_err("set should not contain some of set2 yet\n");
    }
    uset_complementAll(set, set2);
    if (!uset_containsSome(set, set2)) {
        log_err("set should contain some of set2\n");
    }
    expect(set, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_removeAll(set, set2);
    expect(set, "ghi", "abcdef{bc}", NULL);

    /* [a-c g-i] */
    uset_addAll(set2, set);
    expect(set2, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_retainAll(set2, set);
    expect(set2, "ghi", "abcdef{bc}", NULL);

    uset_close(set);
    uset_close(set2);
}
Exemplo n.º 9
0
/*
 *   Spoof Detection C API Tests
 */
static void TestUSpoofCAPI(void) {

    /*
     *  basic uspoof_open().
     */
    {
        USpoofChecker *sc;
        UErrorCode  status = U_ZERO_ERROR;
        sc = uspoof_open(&status);
        TEST_ASSERT_SUCCESS(status);
        if (U_FAILURE(status)) {
            /* If things are so broken that we can't even open a default spoof checker,  */
            /*   don't even try the rest of the tests.  They would all fail.             */
            return;
        }
        uspoof_close(sc);
    }

    /*
     * openFromSerialized and serialize
    */
    TEST_SETUP
    int32_t        serializedSize = 0;
    int32_t        actualLength = 0;
    char           *buf;
    USpoofChecker  *sc2;
    int32_t         checkResults;


    serializedSize = uspoof_serialize(sc, NULL, 0, &status);
    TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR);
    TEST_ASSERT(serializedSize > 0);

    /* Serialize the default spoof checker */
    status = U_ZERO_ERROR;
    buf = (char *)malloc(serializedSize + 10);
    TEST_ASSERT(buf != NULL);
    buf[serializedSize] = 42;
    uspoof_serialize(sc, buf, serializedSize, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(42, buf[serializedSize]);

    /* Create a new spoof checker from the freshly serialized data */
    sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_NE(NULL, sc2);
    TEST_ASSERT_EQ(serializedSize, actualLength);

    /* Verify that the new spoof checker at least wiggles */
    checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

    uspoof_close(sc2);
    free(buf);
    TEST_TEARDOWN;



    /*
     * Set & Get Check Flags
    */
    TEST_SETUP
    int32_t t;
    uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status);
    TEST_ASSERT_SUCCESS(status);
    t = uspoof_getChecks(sc, &status);
    TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS);

    uspoof_setChecks(sc, 0, &status);
    TEST_ASSERT_SUCCESS(status);
    t = uspoof_getChecks(sc, &status);
    TEST_ASSERT_EQ(0, t);

    uspoof_setChecks(sc,
                     USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
                     &status);
    TEST_ASSERT_SUCCESS(status);
    t = uspoof_getChecks(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t);
    TEST_TEARDOWN;

    /*
    * get & setAllowedChars
    */
    TEST_SETUP
    USet *us;
    const USet *uset;

    uset = uspoof_getAllowedChars(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(uset_isFrozen(uset));
    us = uset_open((UChar32)0x41, (UChar32)0x5A);   /*  [A-Z]  */
    uspoof_setAllowedChars(sc, us, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status));
    TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status)));
    TEST_ASSERT_SUCCESS(status);
    uset_close(us);
    TEST_TEARDOWN;

    /*
    *  clone()
    */

    TEST_SETUP
    USpoofChecker *clone1 = NULL;
    USpoofChecker *clone2 = NULL;
    int32_t        checkResults = 0;

    clone1 = uspoof_clone(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_NE(clone1, sc);

    clone2 = uspoof_clone(clone1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_NE(clone2, clone1);

    uspoof_close(clone1);

    /* Verify that the cloned spoof checker is alive */
    checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
    uspoof_close(clone2);
    TEST_TEARDOWN;

    /*
    *  basic uspoof_check()
    */
    TEST_SETUP
    int32_t result;
    result = uspoof_check(sc, goodLatin, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, result);

    result = uspoof_check(sc, han_Hiragana, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, result);

    result = uspoof_check(sc, scMixed, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result);
    TEST_TEARDOWN


    /*
     *  get & set Checks
    */
    TEST_SETUP
    int32_t   checks;
    int32_t   checks2;
    int32_t   checkResults;

    checks = uspoof_getChecks(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks);

    checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE);
    uspoof_setChecks(sc, checks, &status);
    TEST_ASSERT_SUCCESS(status);
    checks2 = uspoof_getChecks(sc, &status);
    TEST_ASSERT_EQ(checks, checks2);

    /* The checks that were disabled just above are the same ones that the "scMixed" test fails.
        So with those tests gone checking that Identifier should now succeed */
    checkResults = uspoof_check(sc, scMixed, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);
    TEST_TEARDOWN;

    /*
     *  AllowedLoacles
     */

    TEST_SETUP
    const char  *allowedLocales;
    int32_t  checkResults;

    /* Default allowed locales list should be empty */
    allowedLocales = uspoof_getAllowedLocales(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(strcmp("", allowedLocales) == 0)

    /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
    uspoof_setAllowedLocales(sc, "en, ru_RU", &status);
    TEST_ASSERT_SUCCESS(status);
    allowedLocales = uspoof_getAllowedLocales(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(strstr(allowedLocales, "en") != NULL);
    TEST_ASSERT(strstr(allowedLocales, "ru") != NULL);

    /* Limit checks to USPOOF_CHAR_LIMIT.  Some of the test data has whole script confusables also,
     * which we don't want to see in this test. */
    uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status);
    TEST_ASSERT_SUCCESS(status);

    checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);

    checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    /* Reset with an empty locale list, which should allow all characters to pass */
    uspoof_setAllowedLocales(sc, " ", &status);
    TEST_ASSERT_SUCCESS(status);

    checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);
    TEST_TEARDOWN;

    /*
     * AllowedChars   set/get the USet of allowed characters.
     */
    TEST_SETUP
    const USet  *set;
    USet        *tmpSet;
    int32_t      checkResults;

    /* By default, we should see no restriction; the USet should allow all characters. */
    set = uspoof_getAllowedChars(sc, &status);
    TEST_ASSERT_SUCCESS(status);
    tmpSet = uset_open(0, 0x10ffff);
    TEST_ASSERT(uset_equals(tmpSet, set));

    /* Setting the allowed chars should enable the check. */
    uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status);
    TEST_ASSERT_SUCCESS(status);

    /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
    uset_remove(tmpSet, goodLatin[1]);
    uspoof_setAllowedChars(sc, tmpSet, &status);
    TEST_ASSERT_SUCCESS(status);
    uset_close(tmpSet);

    /* Latin Identifier should now fail; other non-latin test cases should still be OK
     *  Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE
     *        which will give us a USPOOF_RESTRICTION_LEVEL failure.
     */
    checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults);

    checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
    TEST_TEARDOWN;

    /*
     * check UTF-8
     */
    TEST_SETUP
    char    utf8buf[200];
    int32_t checkResults;
    int32_t position;

    u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    position = 666;
    checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);
    TEST_ASSERT_EQ(0, position);

    u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    position = 666;
    checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
    TEST_ASSERT_EQ(0, position);

    TEST_TEARDOWN;

    /*
     * uspoof_areConfusable()
     */
    TEST_SETUP
    int32_t  checkResults;

    checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

    checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

    TEST_TEARDOWN;

    /*
     * areConfusableUTF8
     */
    TEST_SETUP
    int32_t checkResults;
    char s1[200];
    char s2[200];


    u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
    u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);

    u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
    u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, checkResults);

    u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
    u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);

    TEST_TEARDOWN;


    /*
     * getSkeleton
     */

    TEST_SETUP
    UChar dest[100];
    int32_t   skelLength;

    skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, UPRV_LENGTHOF(dest), &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
    TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);

    skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, (char*)dest,
                                        UPRV_LENGTHOF(dest), &status);
    TEST_ASSERT_SUCCESS(status);

    skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status);
    TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
    TEST_ASSERT_EQ(3, skelLength);
    status = U_ZERO_ERROR;

    TEST_TEARDOWN;

    /*
     * get Inclusion and Recommended sets
     */
    TEST_SETUP
    const USet *inclusions = NULL;
    const USet *recommended = NULL;

    inclusions = uspoof_getInclusionSet(&status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(TRUE, uset_isFrozen(inclusions));

    status = U_ZERO_ERROR;
    recommended = uspoof_getRecommendedSet(&status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT_EQ(TRUE, uset_isFrozen(recommended));
    TEST_TEARDOWN;

}
Exemplo n.º 10
0
static int32_t
u_scanf_scanset_handler(UFILE       *input,
                        u_scanf_spec_info *info,
                        ufmt_args   *args,
                        const UChar *fmt,
                        int32_t     *fmtConsumed,
                        int32_t     *argConverted)
{
    USet        *scanset;
    UErrorCode  status = U_ZERO_ERROR;
    int32_t     chLeft = INT32_MAX;
    UChar32     c;
    UChar       *alias = (UChar*) (args[0].ptrValue);
    UBool       isNotEOF = FALSE;
    UBool       readCharacter = FALSE;

    /* Create an empty set */
    scanset = uset_open(0, -1);

    /* Back up one to get the [ */
    fmt--;

    /* truncate to the width, if specified and alias the target */
    if(info->fWidth >= 0) {
        chLeft = info->fWidth;
    }

    /* parse the scanset from the fmt string */
    *fmtConsumed = uset_applyPattern(scanset, fmt, -1, 0, &status);

    /* verify that the parse was successful */
    if (U_SUCCESS(status)) {
        c=0;

        /* grab characters one at a time and make sure they are in the scanset */
        while(chLeft > 0) {
            if ((isNotEOF = ufile_getch32(input, &c)) && uset_contains(scanset, c)) {
                readCharacter = TRUE;
                if (!info->fSkipArg) {
                    int32_t idx = 0;
                    UBool isError = FALSE;

                    U16_APPEND(alias, idx, chLeft, c, isError);
                    if (isError) {
                        break;
                    }
                    alias += idx;
                }
                chLeft -= (1 + U_IS_SUPPLEMENTARY(c));
            }
            else {
                /* if the character's not in the scanset, break out */
                break;
            }
        }

        /* put the final character we read back on the input */
        if(isNotEOF && chLeft > 0) {
            u_fungetc(c, input);
        }
    }

    uset_close(scanset);

    /* if we didn't match at least 1 character, fail */
    if(!readCharacter)
        return -1;
    /* otherwise, add the terminator */
    else if (!info->fSkipArg) {
        *alias = 0x00;
    }

    /* we converted 1 arg */
    *argConverted = !info->fSkipArg;
    return (info->fWidth >= 0 ? info->fWidth : INT32_MAX) - chLeft;
}
Exemplo n.º 11
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory && fRanges) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

#if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4))
        UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar));
        ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager);
        int c=0;
        rangeStr[c++] = chOpenSquare;
        for (unsigned int i = 0;  i < fElemCount - 1;  i += 2) {
            XMLCh buffer[10];
            XMLSize_t len, j;

            rangeStr[c++] = chBackSlash;
            rangeStr[c++] = chLatin_U;
            XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager);
            len = XMLString::stringLen(buffer);
            for(j=0;j<(8-len);j++)
                rangeStr[c++] = chDigit_0;
            XMLCh* p=buffer;
            while(*p)
                rangeStr[c++] = *p++;
            if(fRanges[i+1]!=fRanges[i])
            {
                rangeStr[c++] = chDash;
                rangeStr[c++] = chBackSlash;
                rangeStr[c++] = chLatin_U;
                XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager);
                len = XMLString::stringLen(buffer);
                for(j=0;j<(8-len);j++)
                    rangeStr[c++] = chDigit_0;
                p=buffer;
                while(*p)
                    rangeStr[c++] = *p++;
            }
        }
        rangeStr[c++] = chCloseSquare;
        rangeStr[c++] = chNull;
        UErrorCode ec=U_ZERO_ERROR;
        USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec);
        if(range)
        {
            ec = U_ZERO_ERROR;
            uint32_t cbCount=uset_serialize(range, NULL, 0, &ec);
            uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t));
            ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager);
            ec = U_ZERO_ERROR;
            uset_serialize(range, buffer, cbCount, &ec);
            USerializedSet serializedSet;
            uset_getSerializedSet(&serializedSet, buffer, cbCount);
            int32_t nSets=uset_getSerializedRangeCount(&serializedSet);
            for(int32_t i=0; i<nSets; i++)
            {
                UChar32 start, end;
                uset_getSerializedRange(&serializedSet, i, &start, &end);
                lwrToken->addRange(start, end);
            }
            // does this release the memory allocated by the set?
            uset_setSerializedToOne(&serializedSet, 32);
            uset_close(range);
        }
#else
        unsigned int exceptIndex = 0;

        for (unsigned int i = 0;  i < fElemCount - 1;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if XERCES_USE_TRANSCODER_ICU
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    ch += chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    ch -= chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
#endif

                const unsigned int  exceptionsSize =
                    sizeof(s_exceptions) / sizeof(s_exceptions[0]);

                // Add any exception chars.  These are characters where the the
                // case mapping is not symmetric.  (Unicode case mappings are not isomorphic...)
                while (exceptIndex < exceptionsSize)
                {
                    if (s_exceptions[exceptIndex].baseChar < ch)
                    {
                        ++exceptIndex;
                    }
                    else if (s_exceptions[exceptIndex].baseChar == ch)
                    {
                        const XMLInt32  matchingChar =
                            s_exceptions[exceptIndex].matchingChar;

                        lwrToken->addRange(
                            matchingChar,
                            matchingChar);

                        ++exceptIndex;
                    }
                    else
                    {
                        break;
                    }
                }
            }
        }

        lwrToken->mergeRanges(this);
#endif
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
        // TODO(dbertoni) This is a temporary hack until we can change the ABI.
        // See Jira issue XERCESC-1866 for more details.
        // Overload the fCaseIToken data member to be the case-insensitive token
        // that's caching the case-insensitive one.  We need this because tokens
        // have varying lifetimes.
        fCaseIToken->setCaseInsensitiveToken(this);
    }

    return fCaseIToken;
}