UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    UVector *dest = new UVector(status);
    if (dest == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
    const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
    do {
        if (U_FAILURE(status)) {
            return dest;
        }
        UnicodeString *str = new UnicodeString(src, -1);
        if (str == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
        dest->addElement(str, status);
        src += str->length() + 1;
    } while (src < limit);
    return dest;
}
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    UVector *dest = new UVector(status);
    if (dest == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
    const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
    do {
        if (U_FAILURE(status)) {
            return dest;
        }
        UnicodeString *str = new UnicodeString(src, -1);
        if (str == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            dest->addElement(str, status);
            src += str->length() + 1;
        }
    } while (src < limit);
    dest->sortWithUComparator(sortCollateComparator, collator_, status);
    return dest;
}
Example #3
0
/**
 * Convert the elements of the 'list' vector, which are SingleID
 * objects, into actual Transliterator objects.  In the course of
 * this, some (or all) entries may be removed.  If all entries
 * are removed, the NULL transliterator will be added.
 *
 * Delete entries with empty basicIDs; these are generated by
 * elements like "(A)" in the forward direction, or "A()" in
 * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
 * SingleID entries to actual transliterators.
 *
 * @param list vector of SingleID objects.  On exit, vector
 * of one or more Transliterators.
 * @return new value of insertIndex.  The index will shift if
 * there are empty items, like "(Lower)", with indices less than
 * insertIndex.
 */
void TransliteratorIDParser::instantiateList(UVector & list,
        UErrorCode & ec)
{
	UVector tlist(ec);
	if (U_FAILURE(ec))
	{
		goto RETURN;
	}
	tlist.setDeleter(_deleteTransliteratorTrIDPars);

	Transliterator * t;
	int32_t i;
	for (i = 0; i <= list.size(); ++i) // [sic]: i<=list.size()
	{
		// We run the loop too long by one, so we can
		// do an insert after the last element
		if (i == list.size())
		{
			break;
		}

		SingleID * single = (SingleID *) list.elementAt(i);
		if (single->basicID.length() != 0)
		{
			t = single->createInstance();
			if (t == NULL)
			{
				ec = U_INVALID_ID;
				goto RETURN;
			}
			tlist.addElement(t, ec);
			if (U_FAILURE(ec))
			{
				delete t;
				goto RETURN;
			}
		}
	}

	// An empty list is equivalent to a NULL transliterator.
	if (tlist.size() == 0)
	{
		t = createBasicInstance(ANY_NULL, NULL);
		if (t == NULL)
		{
			// Should never happen
			ec = U_INTERNAL_TRANSLITERATOR_ERROR;
		}
		tlist.addElement(t, ec);
		if (U_FAILURE(ec))
		{
			delete t;
		}
	}

RETURN:

	UObjectDeleter * save = list.setDeleter(_deleteSingleID);
	list.removeAllElements();

	if (U_SUCCESS(ec))
	{
		list.setDeleter(_deleteTransliteratorTrIDPars);

		while (tlist.size() > 0)
		{
			t = (Transliterator *) tlist.orphanElementAt(0);
			list.addElement(t, ec);
			if (U_FAILURE(ec))
			{
				delete t;
				list.removeAllElements();
				break;
			}
		}
	}

	list.setDeleter(save);
}
Example #4
0
U_CDECL_END

/**
 * Parse a compound ID, consisting of an optional forward global
 * filter, a separator, one or more single IDs delimited by
 * separators, an an optional reverse global filter.  The
 * separator is a semicolon.  The global filters are UnicodeSet
 * patterns.  The reverse global filter must be enclosed in
 * parentheses.
 * @param id the pattern the parse
 * @param dir the direction.
 * @param canonID OUTPUT parameter that receives the canonical ID,
 * consisting of canonical IDs for all elements, as returned by
 * parseSingleID(), separated by semicolons.  Previous contents
 * are discarded.
 * @param list OUTPUT parameter that receives a list of SingleID
 * objects representing the parsed IDs.  Previous contents are
 * discarded.
 * @param globalFilter OUTPUT parameter that receives a pointer to
 * a newly created global filter for this ID in this direction, or
 * NULL if there is none.
 * @return TRUE if the parse succeeds, that is, if the entire
 * id is consumed without syntax error.
 */
UBool TransliteratorIDParser::parseCompoundID(const UnicodeString & id, int32_t dir,
        UnicodeString & canonID,
        UVector & list,
        UnicodeSet *& globalFilter)
{
	UErrorCode ec = U_ZERO_ERROR;
	int32_t i;
	int32_t pos = 0;
	int32_t withParens = 1;
	list.removeAllElements();
	UnicodeSet * filter;
	globalFilter = NULL;
	canonID.truncate(0);

	// Parse leading global filter, if any
	withParens = 0; // parens disallowed
	filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
	if (filter != NULL)
	{
		if (!ICU_Utility::parseChar(id, pos, ID_DELIM))
		{
			// Not a global filter; backup and resume
			canonID.truncate(0);
			pos = 0;
		}
		if (dir == FORWARD)
		{
			globalFilter = filter;
		}
		else
		{
			delete filter;
		}
		filter = NULL;
	}

	UBool sawDelimiter = TRUE;
	for (;;)
	{
		SingleID * single = parseSingleID(id, pos, dir, ec);
		if (single == NULL)
		{
			break;
		}
		if (dir == FORWARD)
		{
			list.addElement(single, ec);
		}
		else
		{
			list.insertElementAt(single, 0, ec);
		}
		if (U_FAILURE(ec))
		{
			goto FAIL;
		}
		if (!ICU_Utility::parseChar(id, pos, ID_DELIM))
		{
			sawDelimiter = FALSE;
			break;
		}
	}

	if (list.size() == 0)
	{
		goto FAIL;
	}

	// Construct canonical ID
	for (i = 0; i < list.size(); ++i)
	{
		SingleID * single = (SingleID *) list.elementAt(i);
		canonID.append(single->canonID);
		if (i != (list.size() - 1))
		{
			canonID.append(ID_DELIM);
		}
	}

	// Parse trailing global filter, if any, and only if we saw
	// a trailing delimiter after the IDs.
	if (sawDelimiter)
	{
		withParens = 1; // parens required
		filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
		if (filter != NULL)
		{
			// Don't require trailing ';', but parse it if present
			ICU_Utility::parseChar(id, pos, ID_DELIM);

			if (dir == REVERSE)
			{
				globalFilter = filter;
			}
			else
			{
				delete filter;
			}
			filter = NULL;
		}
	}

	// Trailing unparsed text is a syntax error
	ICU_Utility::skipWhitespace(id, pos, TRUE);
	if (pos != id.length())
	{
		goto FAIL;
	}

	return TRUE;

FAIL:
	UObjectDeleter * save = list.setDeleter(_deleteSingleID);
	list.removeAllElements();
	list.setDeleter(save);
	delete globalFilter;
	globalFilter = NULL;
	return FALSE;
}
//
//  First characters in scripts.
//  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
//  The vector is sorted according to this index's collation.
//
//  This code is too slow to use, so for now hard code the data.
//    Hard coded implementation is follows.
//
UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    UnicodeString results[USCRIPT_CODE_LIMIT];
    UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");

    UnicodeSetIterator siter(*TO_TRY);
    while (siter.next()) {
        const UnicodeString &current = siter.getString();
        Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
        if (r < 0) {  // TODO fix; we only want "real" script characters, not
                      // symbols.
            continue;
        }

        int script = uscript_getScript(current.char32At(0), &status);
        if (results[script].length() == 0) {
            results[script] = current;
        }
        else if (ruleBasedCollator->compare(current, results[script]) < 0) {
            results[script] = current;
        }
    }

    UnicodeSet extras;
    UnicodeSet expansions;
    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
    const UCollator *uRuleBasedCollator = rbc->getUCollator();
    ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
    extras.addAll(expansions).removeAll(*TO_TRY);
    if (extras.size() != 0) {
        const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
        UnicodeSetIterator extrasIter(extras);
        while (extrasIter.next()) {
            const UnicodeString &current = extrasIter.next();
            if (!TO_TRY->containsAll(current))
                continue;
            if (!normalizer->isNormalized(current, status) ||
                ruleBasedCollator->compare(current, LOWER_A) < 0) {
                continue;
            }
            int script = uscript_getScript(current.char32At(0), &status);
            if (results[script].length() == 0) {
                results[script] = current;
            } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
                results[script] = current;
            }
        }
    }

    UVector *dest = new UVector(status);
    dest->setDeleter(uprv_deleteUObject);
    for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
        if (results[i].length() > 0) {
            dest->addElement(results[i].clone(), status);
        }
    }
    dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
    return dest;
}
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.

    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // first sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator

    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.

    UnicodeSet labelSet;
    for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
        UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //        is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets == NULL) {
                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        targets = new UnicodeSet();
                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                    }
                    targets->add(item);
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.

    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
    //    Implemented by copying the elements to be retained to a new UVector.

    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        UVector *newLabels = new UVector(status);
        newLabels->setDeleter(uprv_deleteUObject);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump == old) {
                // it.remove();
            } else {
                newLabels->addElement(str->clone(), status);
                old = bump;
            }
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.
    
    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);

    indexBuildRequired_ = FALSE;
    resetBucketIterator(status);
}
Example #7
0
void
ZoneMeta::initAvailableMetaZoneIDs () {
    UBool initialized;
    UMTX_CHECK(&gZoneMetaLock, gMetaZoneIDsInitialized, initialized);
    if (!initialized) {
        umtx_lock(&gZoneMetaLock);
        {
            if (!gMetaZoneIDsInitialized) {
                UErrorCode status = U_ZERO_ERROR;
                UHashtable *metaZoneIDTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, &status);
                uhash_setKeyDeleter(metaZoneIDTable, uprv_deleteUObject);
                // No valueDeleter, because the vector maintain the value objects
                UVector *metaZoneIDs = NULL;
                if (U_SUCCESS(status)) {
                    metaZoneIDs = new UVector(NULL, uhash_compareUChars, status);
                    if (metaZoneIDs == NULL) {
                        status = U_MEMORY_ALLOCATION_ERROR;
                    }
                } else {
                    uhash_close(metaZoneIDTable);
                }
                if (U_SUCCESS(status)) {
                    U_ASSERT(metaZoneIDs != NULL);
                    metaZoneIDs->setDeleter(uprv_free);

                    UResourceBundle *rb = ures_openDirect(NULL, gMetaZones, &status);
                    UResourceBundle *bundle = ures_getByKey(rb, gMapTimezonesTag, NULL, &status);
                    UResourceBundle res;
                    ures_initStackObject(&res);
                    while (U_SUCCESS(status) && ures_hasNext(bundle)) {
                        ures_getNextResource(bundle, &res, &status);
                        if (U_FAILURE(status)) {
                            break;
                        }
                        const char *mzID = ures_getKey(&res);
                        int32_t len = uprv_strlen(mzID);
                        UChar *uMzID = (UChar*)uprv_malloc(sizeof(UChar) * (len + 1));
                        if (uMzID == NULL) {
                            status = U_MEMORY_ALLOCATION_ERROR;
                            break;
                        }
                        u_charsToUChars(mzID, uMzID, len);
                        uMzID[len] = 0;
                        UnicodeString *usMzID = new UnicodeString(uMzID);
                        if (uhash_get(metaZoneIDTable, usMzID) == NULL) {
                            metaZoneIDs->addElement((void *)uMzID, status);
                            uhash_put(metaZoneIDTable, (void *)usMzID, (void *)uMzID, &status);
                        } else {
                            uprv_free(uMzID);
                            delete usMzID;
                        }
                    }
                    if (U_SUCCESS(status)) {
                        gMetaZoneIDs = metaZoneIDs;
                        gMetaZoneIDTable = metaZoneIDTable;
                        gMetaZoneIDsInitialized = TRUE;
                    } else {
                        uhash_close(metaZoneIDTable);
                        delete metaZoneIDs;
                    }
                    ures_close(&res);
                    ures_close(bundle);
                    ures_close(rb);
                }
            }
        }
        umtx_unlock(&gZoneMetaLock);
    }
}