/**
 * Rebuilds the index (labels and buckets) if a rebuild is pending.
 *
 * Returns immediately when status already indicates a failure or when
 * indexBuildRequired_ is not set. Otherwise it discards previously built data,
 * derives the label list from initialLabels_, cuts that list down when it has
 * more than maxLabelCount_ entries, and then builds and fills the buckets.
 *
 * @param status  ICU error code, in/out. Set to U_MEMORY_ALLOCATION_ERROR when an
 *                allocation made by this function fails. When a failure is detected
 *                while the labels are being computed, the function returns early and
 *                leaves indexBuildRequired_ set.
 */
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.
    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // First sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator.
    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
    if (U_FAILURE(status)) {
        // Do not walk a vector whose construction, population or sorting failed.
        return;
    }

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.
    UnicodeSet labelSet;
    for (int32_t psIndex = 0; psIndex < preferenceSorting.size(); psIndex++) {
        // The vector owns its strings and is not modified inside this loop,
        // so a reference is enough; no per-item copy is needed.
        const UnicodeString &item =
            *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //        is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets =
                        static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets == NULL) {
                        // Java: alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        targets = new UnicodeSet();
                        if (targets == NULL) {
                            status = U_MEMORY_ALLOCATION_ERROR;
                            return;
                        }
                        // NOTE(review): clone() can presumably also return NULL on allocation
                        // failure; how uhash_put treats a NULL key is not visible here.
                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                        if (U_FAILURE(status)) {
                            // NOTE(review): whether the table disposed of `targets` depends on the
                            // deleters installed on alreadyIn_, which are not visible here. Either
                            // way the pointer must not be used after a failed put.
                            return;
                        }
                    }
                    targets->add(item);
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.
    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
    if (U_FAILURE(status)) {
        return;
    }

    // If the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element.
    //    Implemented by copying the elements to be retained to a new UVector.
    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        UVector *newLabels = new UVector(status);
        if (newLabels == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newLabels->setDeleter(uprv_deleteUObject);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex = 0; srcIndex < labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump != old) {
                // Keep this label; labels whose bump equals the previous one are dropped
                // (the Java original calls it.remove() for them).
                newLabels->addElement(str->clone(), status);
                old = bump;
            }
        }
        if (U_FAILURE(status)) {
            // Keep the complete label list rather than installing a partial copy.
            delete newLabels;
            return;
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.
    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);
    indexBuildRequired_ = FALSE;

    resetBucketIterator(status);
}
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); if (U_FAILURE(errorCode)) { return; } const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); const UnicodeString &overflowBoundary = *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); // We make a sorted array of elements. // Some of the input may be redundant. // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". // We filter out those cases. UnicodeSetIterator iter(*initialLabels_); while (iter.next()) { const UnicodeString *item = &iter.getString(); LocalPointer<UnicodeString> ownedItem; UBool checkDistinct; int32_t itemLength = item->length(); if (!item->hasMoreChar32Than(0, itemLength, 1)) { checkDistinct = FALSE; } else if(item->charAt(itemLength - 1) == 0x2a && // '*' item->charAt(itemLength - 2) != 0x2a) { // Use a label if it is marked with one trailing star, // even if the label string sorts the same when all contractions are suppressed. ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); item = ownedItem.getAlias(); if (item == NULL) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } checkDistinct = FALSE; } else { checkDistinct = TRUE; } if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { // Ignore a primary-ignorable or non-alphabetic index character. } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { // Ignore an index character that will land in the overflow bucket. } else if (checkDistinct && collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { // Ignore a multi-code point index character that does not sort distinctly // from the sequence of its separate characters. 
} else { int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); if (insertionPoint < 0) { indexCharacters.insertElementAt( ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); } else { const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { indexCharacters.setElementAt( ownedString(*item, ownedItem, errorCode), insertionPoint); } } } } if (U_FAILURE(errorCode)) { return; } // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element int32_t size = indexCharacters.size() - 1; if (size > maxLabelCount_) { int32_t count = 0; int32_t old = -1; for (int32_t i = 0; i < indexCharacters.size();) { ++count; int32_t bump = count * maxLabelCount_ / size; if (bump == old) { indexCharacters.removeElementAt(i); } else { old = bump; ++i; } } } }
// Generator entry point.
// Reads two arguments in order (element count, then maximum value), prints the
// count on its own line, then prints the sequence produced by
// rnd.next<std::vector<int>>(count, 1, maximum) joined with single spaces.
// NOTE(review): the bounds 1..maximum are presumably inclusive — confirm
// against the generator library.
TESTLIB_GENERATE() {
    size_t length = args.get<size_t>();
    int maxValue = args.get<int>();

    std::cout << length << std::endl;
    std::cout << separated(rnd.next<std::vector<int>>(length, 1, maxValue), ' ')
              << std::endl;
}