예제 #1
0
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.

    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // first sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator

    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.

    UnicodeSet labelSet;
    for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
        UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //        is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets == NULL) {
                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        targets = new UnicodeSet();
                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                    }
                    targets->add(item);
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.

    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
    //    Implemented by copying the elements to be retained to a new UVector.

    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        UVector *newLabels = new UVector(status);
        newLabels->setDeleter(uprv_deleteUObject);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump == old) {
                // it.remove();
            } else {
                newLabels->addElement(str->clone(), status);
                old = bump;
            }
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.
    
    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);

    indexBuildRequired_ = FALSE;
    resetBucketIterator(status);
}
예제 #2
0
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
    const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
    if (U_FAILURE(errorCode)) { return; }

    const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
    const UnicodeString &overflowBoundary =
        *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);

    // We make a sorted array of elements.
    // Some of the input may be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
    // We filter out those cases.
    UnicodeSetIterator iter(*initialLabels_);
    while (iter.next()) {
        const UnicodeString *item = &iter.getString();
        LocalPointer<UnicodeString> ownedItem;
        UBool checkDistinct;
        int32_t itemLength = item->length();
        if (!item->hasMoreChar32Than(0, itemLength, 1)) {
            checkDistinct = FALSE;
        } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
                item->charAt(itemLength - 2) != 0x2a) {
            // Use a label if it is marked with one trailing star,
            // even if the label string sorts the same when all contractions are suppressed.
            ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
            item = ownedItem.getAlias();
            if (item == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            checkDistinct = FALSE;
        } else {
            checkDistinct = TRUE;
        }
        if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
            // Ignore a primary-ignorable or non-alphabetic index character.
        } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
            // Ignore an index character that will land in the overflow bucket.
        } else if (checkDistinct &&
                collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
            // Ignore a multi-code point index character that does not sort distinctly
            // from the sequence of its separate characters.
        } else {
            int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
            if (insertionPoint < 0) {
                indexCharacters.insertElementAt(
                    ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
            } else {
                const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
                if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
                    indexCharacters.setElementAt(
                        ownedString(*item, ownedItem, errorCode), insertionPoint);
                }
            }
        }
    }
    if (U_FAILURE(errorCode)) { return; }

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element

    int32_t size = indexCharacters.size() - 1;
    if (size > maxLabelCount_) {
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t i = 0; i < indexCharacters.size();) {
            ++count;
            int32_t bump = count * maxLabelCount_ / size;
            if (bump == old) {
                indexCharacters.removeElementAt(i);
            } else {
                old = bump;
                ++i;
            }
        }
    }
}
예제 #3
0
TESTLIB_GENERATE() {
    size_t n = args.get<size_t>();
    int max = args.get<int>();
    std::cout << n << std::endl;
    std::cout << separated(rnd.next<std::vector<int>>(n, 1, max), ' ') << std::endl;
}