Пример #1
0
static void TestCharLength()
{
    static uint32_t codepoint[]={
        1, 0x0061,
        1, 0xe065,
        1, 0x20ac,
        2, 0x20402,
        2, 0x23456,
        2, 0x24506,
        2, 0x20402,
        2, 0x10402,
        1, 0xd7ff,
        1, 0xe000
    };
    
    int16_t i;
    UBool multiple;
    for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
        UChar32 c=codepoint[i+1];
        if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U16_LENGTH(c) != (uint16_t)codepoint[i]){
              log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF16_CHAR_LENGTH(c));
        }else{
              log_verbose("The no: of code units for %lx is %d\n",c, UTF16_CHAR_LENGTH(c) ); 
        }
        multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
        if(UTF16_NEED_MULTIPLE_UCHAR(c) != multiple){
              log_err("ERROR: UTF16_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
        }
    }
}
Пример #2
0
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

    UnicodeString toPut(segment, segLen);

    fillinResult->put(toPut, new UnicodeString(toPut), status);

    UnicodeSet starts;

    // cycle through all the characters
    UChar32 cp;
    for (int32_t i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        UTF_GET_CHAR(segment, 0, i, segLen, cp);
        if (!nfcImpl.getCanonStartSet(cp, starts)) {
            continue;
        }
        // if so, see which decompositions match
        UnicodeSetIterator iter(starts);
        while (iter.next()) {
            UChar32 cp2 = iter.getCodepoint();
            Hashtable remainder(status);
            remainder.setValueDeleter(uhash_deleteUnicodeString);
            if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {
                continue;
            }

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, i);
            prefix += cp2;

            int32_t el = -1;
            const UHashElement *ne = remainder.nextElement(el);
            while (ne != NULL) {
                UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                /* test for NULL */
                if (toAdd == 0) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                *toAdd += item;
                fillinResult->put(*toAdd, toAdd, status);

                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));

                ne = remainder.nextElement(el);
            }
        }
    }

    /* Test for buffer overflows */
    if(U_FAILURE(status)) {
        return NULL;
    }
    return fillinResult;
}
Пример #3
0
/**
 * Dumb recursive implementation of permutation.
 * TODO: optimize
 * @param source the string to find permutations for
 * @return the results in a set.
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
    int32_t i = 0;

    // optimization:
    // if zero or one character, just return a set with it
    // we check for length < 2 to keep from counting code points all the time
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        /* test for NULL */
        if (toPut == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    UChar32 cp;
    Hashtable subpermute(status);
    if(U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uhash_deleteUnicodeString);

    for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
        cp = source.char32At(i);
        const UHashElement *ne = NULL;
        int32_t el = -1;
        UnicodeString subPermuteString = source;

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
            continue;
        }

        subpermute.removeAll();

        // see what the permutations of the characters before and after this one are
        //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
        permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
        /* Test for buffer overflows */
        if(U_FAILURE(status)) {
            return;
        }
        // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 
        // of source at this point.

        // prefix this character to all of them
        ne = subpermute.nextElement(el);
        while (ne != NULL) {
            UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            //test for  NULL
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
            //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr));
            result->put(*chStr, chStr, status);
            ne = subpermute.nextElement(el);
        }
    }
    //return result;
}
Пример #4
0
/**
 *@param set the source string to iterate against. This allows the same iterator to be used
 * while changing the source string, saving object creation.
 */
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
    int32_t list_length = 0;
    UChar32 cp = 0;
    int32_t start = 0;
    int32_t i = 0;
    UnicodeString *list = NULL;

    nfd.normalize(newSource, source, status);
    if(U_FAILURE(status)) {
      return;
    }
    done = FALSE;

    cleanPieces();

    // catch degenerate case
    if (newSource.length() == 0) {
        pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
        pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
        pieces_length = 1;
        current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
        current_length = 1;
        if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CleanPartialInitialization;
        }
        current[0] = 0;
        pieces[0] = new UnicodeString[1];
        pieces_lengths[0] = 1;
        if (pieces[0] == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CleanPartialInitialization;
        }
        return;
    }


    list = new UnicodeString[source.length()];
    if (list == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto CleanPartialInitialization;
    }

    // i should initialy be the number of code units at the 
    // start of the string
    i = UTF16_CHAR_LENGTH(source.char32At(0));
    //int32_t i = 1;
    // find the segments
    // This code iterates through the source string and 
    // extracts segments that end up on a codepoint that
    // doesn't start any decompositions. (Analysis is done
    // on the NFD form - see above).
    for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
        cp = source.char32At(i);
        if (nfcImpl.isCanonSegmentStarter(cp)) {
            source.extract(start, i-start, list[list_length++]); // add up to i
            start = i;
        }
    }
    source.extract(start, i-start, list[list_length++]); // add last one


    // allocate the arrays, and find the strings that are CE to each segment
    pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
    pieces_length = list_length;
    pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
    current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
    current_length = list_length;
    if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto CleanPartialInitialization;
    }

    for (i = 0; i < current_length; i++) {
        current[i] = 0;
    }
    // for each segment, get all the combinations that can produce 
    // it after NFD normalization
    for (i = 0; i < pieces_length; ++i) {
        //if (PROGRESS) printf("SEGMENT\n");
        pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
    }

    delete[] list;
    return;
// Common section to cleanup all local variables and reset object variables.
CleanPartialInitialization:
    if (list != NULL) {
        delete[] list;
    }
    cleanPieces();
}
Пример #5
0
//Tests for new API for utf-16 support 
void CharIterTest::TestIterationUChar32() {
    UChar textChars[]={ 0x0061, 0x0062, 0xd841, 0xdc02, 0x20ac, 0xd7ff, 0xd842, 0xdc06, 0xd801, 0xdc00, 0x0061, 0x0000};
    UnicodeString text(textChars);
    UChar32 c;
    int32_t i;
    {
        StringCharacterIterator   iter(text, 1);

        UnicodeString iterText;
        iter.getText(iterText);
        if (iterText != text)
          errln("iter.getText() failed");

        if (iter.current32() != text[(int32_t)1])
            errln("Iterator didn't start out in the right place.");

        c=iter.setToStart();
        i=0;
        i=iter.move32(1, CharacterIterator::kStart);
        c=iter.current32();
        if(c != text.char32At(1) || i!=1)
            errln("move32(1, kStart) didn't work correctly expected %X got %X", c, text.char32At(1) );

        i=iter.move32(2, CharacterIterator::kCurrent);
        c=iter.current32();
        if(c != text.char32At(4) || i!=4)
            errln("move32(2, kCurrent) didn't work correctly expected %X got %X i=%ld", c, text.char32At(4), i);
        
        i=iter.move32(-2, CharacterIterator::kCurrent);
        c=iter.current32();
        if(c != text.char32At(1) || i!=1)
            errln("move32(-2, kCurrent) didn't work correctly expected %X got %X i=%d", c, text.char32At(1), i);


        i=iter.move32(-2, CharacterIterator::kEnd);
        c=iter.current32();
        if(c != text.char32At((text.length()-3)) || i!=(text.length()-3))
            errln("move32(-2, kEnd) didn't work correctly expected %X got %X i=%d", c, text.char32At((text.length()-3)), i);
        

        c = iter.first32();
        i = 0;

        if (iter.startIndex() != 0 || iter.endIndex() != text.length())
            errln("startIndex() or endIndex() failed");

        logln("Testing forward iteration...");
        do {
            /* logln("c=%d i=%d char32At=%d", c, i, text.char32At(i)); */
            if (c == CharacterIterator::DONE && i != text.length())
                errln("Iterator reached end prematurely");
            else if(iter.hasNext() == FALSE && i != text.length())
                errln("Iterator reached end prematurely.  Failed at hasNext");
            else if (c != text.char32At(i))
                errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i));

            if (iter.current32() != c)
                errln("current32() isn't working right");
            if(iter.setIndex32(i) != c)
                errln("setIndex32() isn't working right");
            if (c != CharacterIterator::DONE) {
                c = iter.next32();
                i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1;
            }
        } while (c != CharacterIterator::DONE);
        if(iter.hasNext() == TRUE)
           errln("hasNext() returned true at the end of the string");



        c=iter.setToEnd();
        if(iter.getIndex() != text.length() || iter.hasNext() != FALSE)
            errln("setToEnd failed");

        c=iter.next32();
        if(c!= CharacterIterator::DONE)
            errln("next32 didn't return DONE at the end");
        c=iter.setIndex32(text.length()+1);
        if(c!= CharacterIterator::DONE)
            errln("setIndex32(len+1) didn't return DONE");


        c = iter.last32();
        i = text.length()-1;
        logln("Testing backward iteration...");
        do {
            if (c == CharacterIterator::DONE && i >= 0)
                errln((UnicodeString)"Iterator reached start prematurely for i=" + i);
            else if(iter.hasPrevious() == FALSE && i>0)
                errln((UnicodeString)"Iterator reached start prematurely for i=" + i);
            else if (c != text.char32At(i))
                errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i));

            if (iter.current32() != c)
                errln("current32() isn't working right");
            if(iter.setIndex32(i) != c)
                errln("setIndex32() isn't working right");
            if (iter.getIndex() != i)
                errln("getIndex() isn't working right");
            if (c != CharacterIterator::DONE) {
                c = iter.previous32();
                i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1;
            }
        } while (c != CharacterIterator::DONE);
        if(iter.hasPrevious() == TRUE)
            errln("hasPrevious returned true after reaching the start");

        c=iter.previous32();
        if(c!= CharacterIterator::DONE)
            errln("previous32 didn't return DONE at the beginning");




        //testing first32PostInc, next32PostInc, setTostart
        i = 0;
        c=iter.first32PostInc();
        if(c != text.char32At(i))
            errln("first32PostInc failed.  Expected->%X Got->%X", text.char32At(i), c);
        if(iter.getIndex() != UTF16_CHAR_LENGTH(c) + i)
            errln((UnicodeString)"getIndex() after first32PostInc() failed");

        iter.setToStart();
        i=0;
        if (iter.startIndex() != 0)
            errln("setToStart failed");
       
        logln("Testing forward iteration...");
        do {
            if (c != CharacterIterator::DONE)
                c = iter.next32PostInc();

            if(c != text.char32At(i))
                errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i));

            i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1;
            if(iter.getIndex() != i)
                errln("getIndex() aftr next32PostInc() isn't working right");
            if(iter.current32() != text.char32At(i))
                errln("current() after next32PostInc() isn't working right");
        } while (iter.hasNext());
        c=iter.next32PostInc();
        if(c!= CharacterIterator::DONE)
            errln("next32PostInc() didn't return DONE at the beginning");


    }

    {
        StringCharacterIterator iter(text, 1, 11, 10);
        if (iter.startIndex() != 1 || iter.endIndex() != 11)
            errln("creation of a restricted-range iterator failed");

        if (iter.getIndex() != 10 || iter.current32() != text.char32At(10))
            errln("starting the iterator in the middle didn't work");

        c = iter.first32();
       
        i = 1;

        logln("Testing forward iteration over a range...");
        do {
            if (c == CharacterIterator::DONE && i != 11)
                errln("Iterator reached end prematurely");
            else if(iter.hasNext() == FALSE)
                errln("Iterator reached end prematurely");
            else if (c != text.char32At(i))
                errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i));

            if (iter.current32() != c)
                errln("current32() isn't working right");
            if(iter.setIndex32(i) != c)
                errln("setIndex32() isn't working right");

            if (c != CharacterIterator::DONE) {
                c = iter.next32();
                i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1;
            }
        } while (c != CharacterIterator::DONE);
        c=iter.next32();
        if(c != CharacterIterator::DONE) 
            errln("error in next32()");


           
        c=iter.last32();
        i = 10;
        logln("Testing backward iteration over a range...");
        do {
            if (c == CharacterIterator::DONE && i >= 5)
                errln("Iterator reached start prematurely");
            else if(iter.hasPrevious() == FALSE && i > 5)
                errln("Iterator reached start prematurely");
            else if (c != text.char32At(i))
                errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i));
            if (iter.current32() != c)
                errln("current32() isn't working right");
            if (iter.getIndex() != i)
                errln("getIndex() isn't working right");
            if(iter.setIndex32(i) != c)
                errln("setIndex32() isn't working right");

            if (c != CharacterIterator::DONE) {
                c = iter.previous32();
                i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1;
            }
           
        } while (c != CharacterIterator::DONE);
        c=iter.previous32();
        if(c!= CharacterIterator::DONE)
            errln("error on previous32");

                
    }
}