static void TestCharLength() { static uint32_t codepoint[]={ 1, 0x0061, 1, 0xe065, 1, 0x20ac, 2, 0x20402, 2, 0x23456, 2, 0x24506, 2, 0x20402, 2, 0x10402, 1, 0xd7ff, 1, 0xe000 }; int16_t i; UBool multiple; for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){ UChar32 c=codepoint[i+1]; if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U16_LENGTH(c) != (uint16_t)codepoint[i]){ log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF16_CHAR_LENGTH(c)); }else{ log_verbose("The no: of code units for %lx is %d\n",c, UTF16_CHAR_LENGTH(c) ); } multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); if(UTF16_NEED_MULTIPLE_UCHAR(c) != multiple){ log_err("ERROR: UTF16_NEED_MULTIPLE_UCHAR failed for %lx\n", c); } } }
/**
 * Fills fillinResult with the segment itself and with every variant built
 * from a matching decomposition start.
 *
 * For each code point cp at offset i of the segment, the set of code points
 * reported by nfcImpl.getCanonStartSet(cp, ...) is scanned. For every
 * candidate cp2 for which extract() finds matches, one entry is added per
 * string in the remainder table: segment[0, i) + cp2 + remainder string.
 *
 * @param fillinResult table receiving the results; each string is used as its
 *                     own key, and the value is a heap-allocated copy
 *                     (presumably released by a value deleter installed by
 *                     the caller - not visible here)
 * @param segment      UTF-16 code units of the segment
 * @param segLen       number of code units in segment
 * @param status       ICU error code; the function does nothing if it already
 *                     indicates failure
 * @return fillinResult on success, NULL if status indicates failure on entry
 *         or on exit, or if an allocation fails
 */
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

    UnicodeString toPut(segment, segLen);

    // The segment itself is always part of the result. Check the allocation
    // the same way the allocation inside the loop below is checked, so an
    // out-of-memory condition is reported instead of handing NULL to put().
    UnicodeString *segmentCopy = new UnicodeString(toPut);
    if (segmentCopy == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fillinResult->put(toPut, segmentCopy, status);

    UnicodeSet starts;

    // cycle through all the characters
    UChar32 cp;
    for (int32_t i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        UTF_GET_CHAR(segment, 0, i, segLen, cp);
        if (!nfcImpl.getCanonStartSet(cp, starts)) {
            continue;
        }

        // if so, see which decompositions match
        UnicodeSetIterator iter(starts);
        while (iter.next()) {
            UChar32 cp2 = iter.getCodepoint();

            // Scratch table for the strings extract() produces for cp2; it
            // deletes its values when it goes out of scope.
            Hashtable remainder(status);
            remainder.setValueDeleter(uhash_deleteUnicodeString);
            if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {
                continue;
            }

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, i);
            prefix += cp2;

            int32_t el = -1;
            const UHashElement *ne = remainder.nextElement(el);
            while (ne != NULL) {
                // Bind by reference: the value is only appended, so copying
                // it into a temporary string first is unnecessary.
                const UnicodeString &item = *((const UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                /* test for NULL */
                if (toAdd == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                *toAdd += item;
                fillinResult->put(*toAdd, toAdd, status);

                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));

                ne = remainder.nextElement(el);
            }
        }
    }

    /* Test for buffer overflows */
    if (U_FAILURE(status)) {
        return NULL;
    }
    return fillinResult;
}
/**
 * Straightforward recursive permutation generator (not optimized).
 *
 * Every permutation of the code points of source is stored in result, keyed
 * by the permuted string with a heap-allocated copy as the value.
 *
 * @param source    the string whose code points are permuted
 * @param skipZeros when true, a code point with canonical combining class 0
 *                  is never moved to the front unless it is already first
 * @param result    table receiving the permutations
 * @param status    ICU error code; nothing is done if it already indicates
 *                  failure, and it is set to U_MEMORY_ALLOCATION_ERROR when
 *                  an allocation fails
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));

    // Base case: zero or one code point has exactly one permutation, itself.
    // The cheap length() test comes first so countChar32() only runs on
    // strings of at most two code units.
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *single = new UnicodeString(source);
        if (single == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, single, status);
        return;
    }

    // Recursive case: pick each code point in turn as the leading one and
    // permute everything else.
    Hashtable subpermute(status);
    if (U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uhash_deleteUnicodeString);

    int32_t unitCount = 0;
    for (int32_t offset = 0; offset < source.length(); offset += unitCount) {
        const UChar32 lead = source.char32At(offset);
        unitCount = UTF16_CHAR_LENGTH(lead);

        // A combining-class-zero code point that is not already first is not
        // tried as the leading code point.
        if (skipZeros && offset != 0 && u_getCombiningClass(lead) == 0) {
            continue;
        }

        subpermute.removeAll();

        // Work on a copy with the chosen code point removed; source itself
        // must stay intact for the following iterations.
        UnicodeString others(source);
        others.replace(offset, unitCount, NULL, 0);
        permute(others, skipZeros, &subpermute, status);
        if (U_FAILURE(status)) {
            return;
        }

        // Put the chosen code point in front of every sub-permutation.
        int32_t pos = -1;
        for (const UHashElement *entry = subpermute.nextElement(pos);
             entry != NULL;
             entry = subpermute.nextElement(pos)) {
            UnicodeString *combined = new UnicodeString(lead);
            if (combined == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            combined->append(*(UnicodeString *)(entry->value.pointer));
            //if (PROGRESS) printf("  Piece: %s\n", UToS(*combined));
            result->put(*combined, combined, status);
        }
    }
}
/** *@param set the source string to iterate against. This allows the same iterator to be used * while changing the source string, saving object creation. */ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { int32_t list_length = 0; UChar32 cp = 0; int32_t start = 0; int32_t i = 0; UnicodeString *list = NULL; nfd.normalize(newSource, source, status); if(U_FAILURE(status)) { return; } done = FALSE; cleanPieces(); // catch degenerate case if (newSource.length() == 0) { pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); pieces_length = 1; current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); current_length = 1; if (pieces == NULL || pieces_lengths == NULL || current == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } current[0] = 0; pieces[0] = new UnicodeString[1]; pieces_lengths[0] = 1; if (pieces[0] == 0) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } return; } list = new UnicodeString[source.length()]; if (list == 0) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } // i should initialy be the number of code units at the // start of the string i = UTF16_CHAR_LENGTH(source.char32At(0)); //int32_t i = 1; // find the segments // This code iterates through the source string and // extracts segments that end up on a codepoint that // doesn't start any decompositions. (Analysis is done // on the NFD form - see above). 
for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) { cp = source.char32At(i); if (nfcImpl.isCanonSegmentStarter(cp)) { source.extract(start, i-start, list[list_length++]); // add up to i start = i; } } source.extract(start, i-start, list[list_length++]); // add last one // allocate the arrays, and find the strings that are CE to each segment pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); pieces_length = list_length; pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); current_length = list_length; if (pieces == NULL || pieces_lengths == NULL || current == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto CleanPartialInitialization; } for (i = 0; i < current_length; i++) { current[i] = 0; } // for each segment, get all the combinations that can produce // it after NFD normalization for (i = 0; i < pieces_length; ++i) { //if (PROGRESS) printf("SEGMENT\n"); pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); } delete[] list; return; // Common section to cleanup all local variables and reset object variables. CleanPartialInitialization: if (list != NULL) { delete[] list; } cleanPieces(); }
//Tests for new API for utf-16 support void CharIterTest::TestIterationUChar32() { UChar textChars[]={ 0x0061, 0x0062, 0xd841, 0xdc02, 0x20ac, 0xd7ff, 0xd842, 0xdc06, 0xd801, 0xdc00, 0x0061, 0x0000}; UnicodeString text(textChars); UChar32 c; int32_t i; { StringCharacterIterator iter(text, 1); UnicodeString iterText; iter.getText(iterText); if (iterText != text) errln("iter.getText() failed"); if (iter.current32() != text[(int32_t)1]) errln("Iterator didn't start out in the right place."); c=iter.setToStart(); i=0; i=iter.move32(1, CharacterIterator::kStart); c=iter.current32(); if(c != text.char32At(1) || i!=1) errln("move32(1, kStart) didn't work correctly expected %X got %X", c, text.char32At(1) ); i=iter.move32(2, CharacterIterator::kCurrent); c=iter.current32(); if(c != text.char32At(4) || i!=4) errln("move32(2, kCurrent) didn't work correctly expected %X got %X i=%ld", c, text.char32At(4), i); i=iter.move32(-2, CharacterIterator::kCurrent); c=iter.current32(); if(c != text.char32At(1) || i!=1) errln("move32(-2, kCurrent) didn't work correctly expected %X got %X i=%d", c, text.char32At(1), i); i=iter.move32(-2, CharacterIterator::kEnd); c=iter.current32(); if(c != text.char32At((text.length()-3)) || i!=(text.length()-3)) errln("move32(-2, kEnd) didn't work correctly expected %X got %X i=%d", c, text.char32At((text.length()-3)), i); c = iter.first32(); i = 0; if (iter.startIndex() != 0 || iter.endIndex() != text.length()) errln("startIndex() or endIndex() failed"); logln("Testing forward iteration..."); do { /* logln("c=%d i=%d char32At=%d", c, i, text.char32At(i)); */ if (c == CharacterIterator::DONE && i != text.length()) errln("Iterator reached end prematurely"); else if(iter.hasNext() == FALSE && i != text.length()) errln("Iterator reached end prematurely. 
Failed at hasNext"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.next32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1; } } while (c != CharacterIterator::DONE); if(iter.hasNext() == TRUE) errln("hasNext() returned true at the end of the string"); c=iter.setToEnd(); if(iter.getIndex() != text.length() || iter.hasNext() != FALSE) errln("setToEnd failed"); c=iter.next32(); if(c!= CharacterIterator::DONE) errln("next32 didn't return DONE at the end"); c=iter.setIndex32(text.length()+1); if(c!= CharacterIterator::DONE) errln("setIndex32(len+1) didn't return DONE"); c = iter.last32(); i = text.length()-1; logln("Testing backward iteration..."); do { if (c == CharacterIterator::DONE && i >= 0) errln((UnicodeString)"Iterator reached start prematurely for i=" + i); else if(iter.hasPrevious() == FALSE && i>0) errln((UnicodeString)"Iterator reached start prematurely for i=" + i); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (iter.getIndex() != i) errln("getIndex() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.previous32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1; } } while (c != CharacterIterator::DONE); if(iter.hasPrevious() == TRUE) errln("hasPrevious returned true after reaching the start"); c=iter.previous32(); if(c!= CharacterIterator::DONE) errln("previous32 didn't return DONE at the beginning"); //testing first32PostInc, next32PostInc, setTostart i = 0; c=iter.first32PostInc(); if(c != text.char32At(i)) errln("first32PostInc failed. 
Expected->%X Got->%X", text.char32At(i), c); if(iter.getIndex() != UTF16_CHAR_LENGTH(c) + i) errln((UnicodeString)"getIndex() after first32PostInc() failed"); iter.setToStart(); i=0; if (iter.startIndex() != 0) errln("setToStart failed"); logln("Testing forward iteration..."); do { if (c != CharacterIterator::DONE) c = iter.next32PostInc(); if(c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i+2 : i+1; if(iter.getIndex() != i) errln("getIndex() aftr next32PostInc() isn't working right"); if(iter.current32() != text.char32At(i)) errln("current() after next32PostInc() isn't working right"); } while (iter.hasNext()); c=iter.next32PostInc(); if(c!= CharacterIterator::DONE) errln("next32PostInc() didn't return DONE at the beginning"); } { StringCharacterIterator iter(text, 1, 11, 10); if (iter.startIndex() != 1 || iter.endIndex() != 11) errln("creation of a restricted-range iterator failed"); if (iter.getIndex() != 10 || iter.current32() != text.char32At(10)) errln("starting the iterator in the middle didn't work"); c = iter.first32(); i = 1; logln("Testing forward iteration over a range..."); do { if (c == CharacterIterator::DONE && i != 11) errln("Iterator reached end prematurely"); else if(iter.hasNext() == FALSE) errln("Iterator reached end prematurely"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.next32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? 
i+2 : i+1; } } while (c != CharacterIterator::DONE); c=iter.next32(); if(c != CharacterIterator::DONE) errln("error in next32()"); c=iter.last32(); i = 10; logln("Testing backward iteration over a range..."); do { if (c == CharacterIterator::DONE && i >= 5) errln("Iterator reached start prematurely"); else if(iter.hasPrevious() == FALSE && i > 5) errln("Iterator reached start prematurely"); else if (c != text.char32At(i)) errln("Character mismatch at position %d, iterator has %X, string has %X", i, c, text.char32At(i)); if (iter.current32() != c) errln("current32() isn't working right"); if (iter.getIndex() != i) errln("getIndex() isn't working right"); if(iter.setIndex32(i) != c) errln("setIndex32() isn't working right"); if (c != CharacterIterator::DONE) { c = iter.previous32(); i=UTF16_NEED_MULTIPLE_UCHAR(c) ? i-2 : i-1; } } while (c != CharacterIterator::DONE); c=iter.previous32(); if(c!= CharacterIterator::DONE) errln("error on previous32"); } }