Esempio n. 1
0
static UBool *
getResultsManually(const char** encodings, int32_t num_encodings,
                   const char *utf8, int32_t length,
                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
  UBool* resultsManually;
  int32_t i;

  resultsManually = (UBool*) uprv_malloc(gCountAvailable);
  uprv_memset(resultsManually, 0, gCountAvailable);

  for(i = 0 ; i < num_encodings ; i++) {
    UErrorCode status = U_ZERO_ERROR;
    /* get unicode set for that converter */
    USet* set;
    UConverter* test_converter;
    UChar32 cp;
    int32_t encIndex, offset;

    set = uset_openEmpty();
    test_converter = ucnv_open(encodings[i], &status);
    ucnv_getUnicodeSet(test_converter, set,
                       whichSet, &status);
    if (excludedCodePoints != NULL) {
      uset_addAll(set, excludedCodePoints);
    }
    uset_freeze(set);
    offset = 0;
    cp = 0;

    encIndex = findIndex(encodings[i]);
    /*
     * The following is almost, but not entirely, the same as
     * resultsManually[encIndex] =
     *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
     * They might be different if the set contains strings,
     * or if the utf8 string contains an illegal sequence.
     *
     * The UConverterSelector does not currently handle strings that can be
     * converted, and it treats an illegal sequence as convertible
     * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
     */
    resultsManually[encIndex] = TRUE;
    while(offset<length) {
      U8_NEXT(utf8, offset, length, cp);
      if (cp >= 0 && !uset_contains(set, cp)) {
        resultsManually[encIndex] = FALSE;
        break;
      }
    }
    uset_close(set);
    ucnv_close(test_converter);
  }
  return resultsManually;
}
Esempio n. 2
0
File: utr.c Progetto: julp/ugrep
// pattern (ustr) is not expected to be a class here (no square brackets here)
USet *create_set_from_ustring(UString *ustr, UBool negate, error_t **UNUSED(error))
{
    USet *uset;

    uset = uset_openEmpty();
    uset_addAllCodePoints(uset, ustr->ptr, ustr->len);
    if (negate) {
        uset_complement(uset);
    }
    uset_freeze(uset);

    return uset;
}
Esempio n. 3
0
U_CAPI USet* U_EXPORT2
utrans_getSourceSet(const UTransliterator* trans,
                    UBool ignoreFilter,
                    USet* fillIn,
                    UErrorCode* status) {
    utrans_ENTRY(status) fillIn;

    if (fillIn == NULL) {
        fillIn = uset_openEmpty();
    }
    if (ignoreFilter) {
        ((Transliterator*) trans)->handleGetSourceSet(*((UnicodeSet*)fillIn));
    } else {
        ((Transliterator*) trans)->getSourceSet(*((UnicodeSet*)fillIn));
    }
    return fillIn;
}
Esempio n. 4
0
void SSearchTest::monkeyTest(char *params)
{
    // ook!
    UErrorCode status = U_ZERO_ERROR;
  //UCollator *coll = ucol_open(NULL, &status);
    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);

    if (U_FAILURE(status)) {
        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
        return;
    }

    CollData  *monkeyData = new CollData(coll, status);

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    USet *letters = uset_openPattern(letter_pattern, 39, &status);
    SetMonkey letterMonkey(letters);
    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
    UnicodeString testCase;
    UnicodeString alternate;
    UnicodeString pattern, altPattern;
    UnicodeString prefix, altPrefix;
    UnicodeString suffix, altSuffix;

    Monkey *monkeys[] = {
        &letterMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey};
    int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
    // int32_t nonMatchCount = 0;

    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
    int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
    int32_t loopCount = quick? 1000 : 10000;
    int32_t firstStrength = 0;
    int32_t lastStrength  = strengthCount - 1; //*/ 0;

    if (params != NULL) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
        UnicodeString p(params);

        loopCount = getIntParam("loop", p, loopCount);
        m_seed    = getIntParam("seed", p, m_seed);

        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
        if (m.find()) {
            UnicodeString breakType = m.group(1, status);

            for (int32_t s = 0; s < strengthCount; s += 1) {
                if (breakType == strengthNames[s]) {
                    firstStrength = lastStrength = s;
                    break;
                }
            }

            m.reset();
            p = m.replaceFirst("", status);
        }

        if (RegexMatcher("\\S", p, 0, status).find()) {
            // Each option is stripped out of the option string as it is processed.
            // All options have been checked.  The option string should have been completely emptied..
            char buf[100];
            p.extract(buf, sizeof(buf), NULL, status);
            buf[sizeof(buf)-1] = 0;
            errln("Unrecognized or extra parameter:  %s\n", buf);
            return;
        }
#else
        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
#endif
    }

    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
        int32_t notFoundCount = 0;

        logln("Setting strength to %s.", strengthNames[s]);
        ucol_setStrength(coll, strengths[s]);

        // TODO: try alternate prefix and suffix too?
        // TODO: alterntaes are only equal at primary strength. Is this OK?
        for(int32_t t = 0; t < loopCount; t += 1) {
            uint32_t seed = m_seed;
            // int32_t  nmc = 0;

            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);

            // pattern
            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);

            testCase.remove();
            testCase.append(prefix);
            testCase.append(/*alt*/pattern);

            // prefix + pattern
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);

            testCase.append(suffix);

            // prefix + pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);

            testCase.remove();
            testCase.append(pattern);
            testCase.append(suffix);

            // pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
        }

       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
    }

    uset_close(contractions);
    uset_close(expansions);
    uset_close(letters);
    delete monkeyData;

    ucol_close(coll);
}
Esempio n. 5
0
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characers we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
     UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
     CEList hanList(coll, hanString, status);
     CEList jamoList(coll, jamoString, status);
     int32_t j = 0;

     if (U_FAILURE(status)) {
         return;
     }

     for (int32_t c = 0; c < jamoList.size(); c += 1) {
         uint32_t jce = jamoList[c];

         if (! isContinuation(jce)) {
             jamoLimits[j++] = jce;
         }
     }

     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

     minHan = 0xFFFFFFFF;
     maxHan = 0;

     for(int32_t h = 0; h < hanList.size(); h += 2) {
         uint32_t han = (uint32_t) hanList[h];

         if (han < minHan) {
             minHan = han;
         }

         if (han > maxHan) {
             maxHan = han;
         }
     }

     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
Esempio n. 6
0
static void TestSelector()
{
  TestText text;
  USet* excluded_sets[3] = { NULL };
  int32_t i, testCaseIdx;

  if (!getAvailableNames()) {
    return;
  }
  if (!text_open(&text)) {
    releaseAvailableNames();;
  }

  excluded_sets[0] = uset_openEmpty();
  for(i = 1 ; i < 3 ; i++) {
    excluded_sets[i] = uset_open(i*30, i*30+500);
  }

  for(testCaseIdx = 0; testCaseIdx < UPRV_LENGTHOF(getEncodingsFns); testCaseIdx++)
  {
    int32_t excluded_set_id;
    int32_t num_encodings;
    const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings);
    if (getTestOption(QUICK_OPTION) && num_encodings > 25) {
      uprv_free((void *)encodings);
      continue;
    }

    /*
     * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
     *
     * This loop was replaced by the following statement because
     * the loop made the test run longer without adding to the code coverage.
     * The handling of the exclusion set is independent of the
     * set of encodings, so there is no need to test every combination.
     */
    excluded_set_id = testCaseIdx % UPRV_LENGTHOF(excluded_sets);
    {
      UConverterSelector *sel_rt, *sel_fb;
      char *buffer_fb = NULL;
      UErrorCode status = U_ZERO_ERROR;
      sel_rt = ucnvsel_open(encodings, num_encodings,
                            excluded_sets[excluded_set_id],
                            UCNV_ROUNDTRIP_SET, &status);
      if (num_encodings == gCountAvailable) {
        /* test the special "all converters" parameter values */
        sel_fb = ucnvsel_open(NULL, 0,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else if (uset_isEmpty(excluded_sets[excluded_set_id])) {
        /* test that a NULL set gives the same results as an empty set */
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              NULL,
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else {
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      }
      if (U_FAILURE(status)) {
        log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status));
        ucnvsel_close(sel_rt);
        uprv_free((void *)encodings);
        continue;
      }

      text_reset(&text);
      for (;;) {
        UBool *manual_rt, *manual_fb;
        static UChar utf16[10000];
        char *s;
        int32_t length8, length16;

        s = text_nextString(&text, &length8);
        if (s == NULL || (getTestOption(QUICK_OPTION) && text.number > 3)) {
          break;
        }

        manual_rt = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_SET);
        manual_fb = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_AND_FALLBACK_SET);
        /* UTF-8 with length */
        status = U_ZERO_ERROR;
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb);
        /* UTF-8 NUL-terminated */
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb);

        u_strFromUTF8(utf16, UPRV_LENGTHOF(utf16), &length16, s, length8, &status);
        if (U_FAILURE(status)) {
          log_err("error converting the test text (string %ld) to UTF-16 - %s\n",
                  (long)text.number, u_errorName(status));
        } else {
          if (text.number == 0) {
            sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status);
          }
          if (U_SUCCESS(status)) {
            /* UTF-16 with length */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb);
            /* UTF-16 NUL-terminated */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb);
          }
        }

        uprv_free(manual_rt);
        uprv_free(manual_fb);
      }
      ucnvsel_close(sel_rt);
      ucnvsel_close(sel_fb);
      uprv_free(buffer_fb);
    }
    uprv_free((void *)encodings);
  }

  releaseAvailableNames();
  text_close(&text);
  for(i = 0 ; i < 3 ; i++) {
    uset_close(excluded_sets[i]);
  }
}
Esempio n. 7
0
/**
 * Basic API test for uset.x
 */
static void TestAPI() {
    USet* set;
    USet* set2;
    UErrorCode ec;
    
    /* [] */
    set = uset_openEmpty();
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 0);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    set = uset_open(1, 1);
    uset_clear(set);
    expect(set, "", "abc{ab}", NULL);
    uset_close(set);

    /* [ABC] */
    set = uset_open(0x0041, 0x0043);
    expect(set, "ABC", "DEF{ab}", NULL);
    uset_close(set);

    /* [a-c{ab}] */
    ec = U_ZERO_ERROR;
    set = uset_openPattern(PAT, PAT_LEN, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
        return;
    }
    if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
        log_err("uset_resemblesPattern of PAT failed\n");
    }
    expect(set, "abc{ab}", "def{bc}", &ec);

    /* [a-d{ab}] */
    uset_add(set, 0x64);
    expect(set, "abcd{ab}", "ef{bc}", NULL);

    /* [acd{ab}{bc}] */
    uset_remove(set, 0x62);
    uset_addString(set, STR_bc, STR_bc_LEN);
    expect(set, "acd{ab}{bc}", "bef{cd}", NULL);

    /* [acd{bc}] */
    uset_removeString(set, STR_ab, STR_ab_LEN);
    expect(set, "acd{bc}", "bfg{ab}", NULL);

    /* [^acd{bc}] */
    uset_complement(set);
    expect(set, "bef{bc}", "acd{ac}", NULL);

    /* [a-e{bc}] */
    uset_complement(set);
    uset_addRange(set, 0x0062, 0x0065);
    expect(set, "abcde{bc}", "fg{ab}", NULL);

    /* [de{bc}] */
    uset_removeRange(set, 0x0050, 0x0063);
    expect(set, "de{bc}", "bcfg{ab}", NULL);

    /* [g-l] */
    uset_set(set, 0x0067, 0x006C);
    expect(set, "ghijkl", "de{bc}", NULL);

    if (uset_indexOf(set, 0x0067) != 0) {
        log_err("uset_indexOf failed finding correct index of 'g'\n");
    }

    if (uset_charAt(set, 0) != 0x0067) {
        log_err("uset_charAt failed finding correct char 'g' at index 0\n");
    }

    /* How to test this one...? */
    uset_compact(set);

    /* [g-i] */
    uset_retain(set, 0x0067, 0x0069);
    expect(set, "ghi", "dejkl{bc}", NULL);

    /* UCHAR_ASCII_HEX_DIGIT */
    uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
    if(U_FAILURE(ec)) {
        log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
        return;
    }
    expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);

    /* [ab] */
    uset_clear(set);
    uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
    expect(set, "ab", "def{ab}", NULL);
    if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
        log_err("set should not conatin all characters of \"bc\" \n");
    }

    /* [] */
    set2 = uset_open(1, 1);
    uset_clear(set2);

    /* space */
    uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
    expect(set2, " ", "abcdefghi{bc}", NULL);

    /* [a-c] */
    uset_set(set2, 0x0061, 0x0063);
    /* [g-i] */
    uset_set(set, 0x0067, 0x0069);

    /* [a-c g-i] */
    if (uset_containsSome(set, set2)) {
        log_err("set should not contain some of set2 yet\n");
    }
    uset_complementAll(set, set2);
    if (!uset_containsSome(set, set2)) {
        log_err("set should contain some of set2\n");
    }
    expect(set, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_removeAll(set, set2);
    expect(set, "ghi", "abcdef{bc}", NULL);

    /* [a-c g-i] */
    uset_addAll(set2, set);
    expect(set2, "abcghi", "def{bc}", NULL);

    /* [g-i] */
    uset_retainAll(set2, set);
    expect(set2, "ghi", "abcdef{bc}", NULL);

    uset_close(set);
    uset_close(set2);
}