U_NAMESPACE_BEGIN

// Construct from a UnicodeString, storing its char form in the member `s`.
//
// When conversion is available (or the platform charset is UTF-8) the string
// is converted with UnicodeString::extract(). Otherwise only invariant
// characters are converted and every other code point becomes '?'.
//
// The constructor has no way to report errors: `status` is local and any
// failure simply leaves `s` with whatever had been appended so far.
CStr::CStr(const UnicodeString &in) {
    UErrorCode status = U_ZERO_ERROR;
#if !UCONFIG_NO_CONVERSION || U_CHARSET_IS_UTF8
    const int32_t srcLength = in.length();
    // First pass with a NULL destination: only the required byte count is wanted.
    int32_t needed = in.extract(0, srcLength, static_cast<char *>(NULL), static_cast<uint32_t>(0));
    int32_t capacity = 0;
    char *dest = s.getAppendBuffer(needed, needed, capacity, status);
    if (U_SUCCESS(status)) {
        // Second pass writes into the buffer handed out by `s`, then the
        // append commits those `needed` bytes.
        in.extract(0, srcLength, dest, capacity);
        s.append(dest, needed, status);
    }
#else
    // No conversion available. Convert any invariant characters; substitute '?' for the rest.
    // Note: can't just call u_UCharsToChars() or CharString.appendInvariantChars() on the
    //       whole string because they require that the entire input be invariant.
    char narrow[2];
    int i = 0;
    while (i < in.length()) {
        if (uprv_isInvariantUString(in.getBuffer()+i, 1)) {
            u_UCharsToChars(in.getBuffer()+i, narrow, 1);
        } else {
            narrow[0] = '?';
        }
        s.append(narrow, 1, status);
        // Step by whole code points so that one code point yields one output char.
        i = in.moveIndex32(i, 1);
    }
#endif
}
// Accumulate a whole string: walk `ustr` one code point at a time, front to
// back, and hand each code point to the operator+= overload that accepts the
// value returned by UnicodeString::char32At().
void AccumulativeWordCounter::operator+=(const UnicodeString& ustr) {
    int32_t pos = 0;
    while (pos < ustr.length()) {
        this->operator+=(ustr.char32At(pos));
        // Advance by one code point (not one code unit).
        pos = ustr.moveIndex32(pos, 1);
    }
}
// Returns true when `cp` is an illegal lead character for a combining dot,
// either directly or through its confusable mapping.
//
// The check is applied first to `cp` itself and then, if that fails, to the
// last code point of the string produced by confusableLookup(cp). The second
// check is skipped when that last code point is `cp` itself, since it has
// already been tested.
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }

    UnicodeString skeleton;
    fSpoofData->confusableLookup(cp, skeleton);

    // Step back one code point from the end to find where the last one starts.
    const int32_t lastStart = skeleton.moveIndex32(skeleton.length(), -1);
    const UChar32 lastCp = skeleton.char32At(lastStart);

    return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
}
//------------------------------------------------------------------------------ // // stripRules Return a rules string without extra spaces. // (Comments are removed separately, during rule parsing.) // //------------------------------------------------------------------------------ UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { UnicodeString strippedRules; int32_t rulesLength = rules.length(); bool skippingSpaces = false; for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) { UChar32 cp = rules.char32At(idx); bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE); if (skippingSpaces && whiteSpace) { continue; } strippedRules.append(cp); skippingSpaces = whiteSpace; } return strippedRules; }
// Return a copy of `item` with CGJ inserted between each pair of adjacent
// code points. Nothing is added before the first or after the last code
// point, so an empty or single-code-point input comes back unchanged.
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
    UnicodeString result;
    int32_t pos = 0;
    while (pos < item.length()) {
        if (pos > 0) {
            // Not the first code point: emit the separator ahead of it.
            result.append(CGJ);
        }
        result.append(item.char32At(pos));
        pos = item.moveIndex32(pos, 1);
    }
    return result;
}
// testConfData - Check each data item from the Unicode confusables.txt file, // verify that it transforms correctly in a skeleton. // void IntlTestSpoof::testConfData() { UErrorCode status = U_ZERO_ERROR; const char *testDataDir = IntlTest::getSourceTestData(status); TEST_ASSERT_SUCCESS(status); char buffer[2000]; uprv_strcpy(buffer, testDataDir); uprv_strcat(buffer, "confusables.txt"); LocalStdioFilePointer f(fopen(buffer, "rb")); if (f.isNull()) { errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); return; } fseek(f.getAlias(), 0, SEEK_END); int32_t fileSize = ftell(f.getAlias()); LocalArray<char> fileBuf(new char[fileSize]); fseek(f.getAlias(), 0, SEEK_SET); int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); TEST_ASSERT_EQ(amt_read, fileSize); TEST_ASSERT(fileSize>0); if (amt_read != fileSize || fileSize <=0) { return; } UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); LocalUSpoofCheckerPointer sc(uspoof_open(&status)); TEST_ASSERT_SUCCESS(status); // Parse lines from the confusables.txt file. Example Line: // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... // Three fields. The hex fields can contain more than one character, // and each character may be more than 4 digits (for supplemntals) // This regular expression matches lines and splits the fields into capture groups. RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); TEST_ASSERT_SUCCESS(status); while (parseLine.find()) { UnicodeString from = parseHex(parseLine.group(1, status)); if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { // The source character was not NFD. // Skip this case; the first step in obtaining a skeleton is to NFD the input, // so the mapping in this line of confusables.txt will never be applied. 
continue; } UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString expected; Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); TEST_ASSERT_SUCCESS(status); int32_t skeletonType = 0; UnicodeString tableType = parseLine.group(3, status); TEST_ASSERT_SUCCESS(status); if (tableType.indexOf("SL") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; } else if (tableType.indexOf("SA") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; } else if (tableType.indexOf("ML") >= 0) { skeletonType = 0; } else if (tableType.indexOf("MA") >= 0) { skeletonType = USPOOF_ANY_CASE; } UnicodeString actual; uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(actual == expected); if (actual != expected) { errln(parseLine.group(0, status)); UnicodeString line = "Actual: "; int i = 0; while (i < actual.length()) { appendHexUChar(line, actual.char32At(i)); i = actual.moveIndex32(i, 1); } errln(line); } if (U_FAILURE(status)) { break; } } }
// Rebuild the index: labels, buckets, and the assignment of records to buckets.
//
// Does nothing when `status` already indicates failure or when no rebuild is
// pending (indexBuildRequired_ is false). Otherwise:
//   1. discards all previously built data,
//   2. filters the initial labels into the set of labels actually used,
//   3. sorts the labels and thins them out if there are too many,
//   4. builds the bucket list, bins the records, and resets the bucket iterator.
//
// @param status  ICU error code, in/out. Passed on to every helper called here.
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.

    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // first sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator

    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.

    UnicodeSet labelSet;
    for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
        // Bind by reference: the string is owned by preferenceSorting, which is
        // not modified inside this loop, and it is only read here. This avoids
        // copy-constructing a UnicodeString on every iteration.
        const UnicodeString &item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //        is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            // Record `item` under the first existing label that is primary-equal to it.
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets == NULL) {
                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        // NOTE(review): neither this allocation nor the clone() below is
                        // checked before use - confirm how allocation failure should be handled.
                        targets = new UnicodeSet();
                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                    }
                    targets->add(item);
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            // A multi-code-point label that compares primary-equal to its
            // CGJ-separated form does not sort as a distinct unit.
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.

    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
    //    Implemented by copying the elements to be retained to a new UVector.

    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        // NOTE(review): the result of this allocation is used unchecked - confirm
        // whether UVector's operator new can return NULL in this build.
        UVector *newLabels = new UVector(status);
        newLabels->setDeleter(uprv_deleteUObject);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            // `bump` is the scaled position of this label; a label is kept only
            // when that value changes from the last kept one. Labels whose bump
            // equals `old` are the ones being dropped (not copied over).
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump != old) {
                newLabels->addElement(str->clone(), status);
                old = bump;
            }
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.

    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);

    indexBuildRequired_ = FALSE;
    resetBucketIterator(status);
}