/**
 * Build a vector of UnicodeStrings from the hard-coded
 * HACK_FIRST_CHARS_IN_SCRIPTS table.
 *
 * The table is walked as a sequence of strings: each UnicodeString is
 * constructed from the current position with length -1, and the cursor then
 * advances by the string's length plus one UChar (presumably the NUL
 * separator between entries).
 *
 * @param status ICU error code. Nothing is done if it already indicates
 *               failure. Set to U_MEMORY_ALLOCATION_ERROR if an allocation
 *               fails.
 * @return a new UVector that owns its UnicodeString elements (deleter is
 *         uprv_deleteUObject), or NULL if status was already a failure or
 *         the vector itself could not be allocated. If a failure occurs
 *         part-way through, the partially filled vector is still returned
 *         and the caller must check status.
 */
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    UVector *dest = new UVector(status);
    if (dest == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);

    const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
    const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
    do {
        if (U_FAILURE(status)) {
            return dest;
        }
        UnicodeString *str = new UnicodeString(src, -1);
        if (str == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
        // Remember how far to advance before ownership of str is handed over.
        const int32_t consumed = str->length() + 1;
        dest->addElement(str, status);
        if (U_FAILURE(status)) {
            // The vector did not take the element, so it is still ours to free.
            delete str;
            return dest;
        }
        src += consumed;
    } while (src < limit);
    return dest;
}
/**
 * Build a vector of UnicodeStrings from the hard-coded
 * HACK_FIRST_CHARS_IN_SCRIPTS table and sort it with this index's collator.
 *
 * The table is walked as a sequence of strings: each UnicodeString is
 * constructed from the current position with length -1, and the cursor then
 * advances by the string's length plus one UChar (presumably the NUL
 * separator between entries).
 *
 * @param status ICU error code. Nothing is done if it already indicates
 *               failure. Set to U_MEMORY_ALLOCATION_ERROR if an allocation
 *               fails.
 * @return a new UVector that owns its UnicodeString elements (deleter is
 *         uprv_deleteUObject), sorted via sortCollateComparator with
 *         collator_ as context; or NULL if status was already a failure or
 *         the vector could not be allocated. If a failure occurs part-way
 *         through, the partially filled, unsorted vector is returned and the
 *         caller must check status.
 */
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    UVector *dest = new UVector(status);
    if (dest == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);

    const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
    const UChar *limit =
        src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
    do {
        if (U_FAILURE(status)) {
            return dest;
        }
        UnicodeString *str = new UnicodeString(src, -1);
        if (str == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
        // Remember how far to advance before ownership of str is handed over.
        const int32_t consumed = str->length() + 1;
        dest->addElement(str, status);
        if (U_FAILURE(status)) {
            // The vector did not take the element, so it is still ours to free.
            delete str;
            return dest;
        }
        src += consumed;
    } while (src < limit);

    dest->sortWithUComparator(sortCollateComparator, collator_, status);
    return dest;
}
/** * Convert the elements of the 'list' vector, which are SingleID * objects, into actual Transliterator objects. In the course of * this, some (or all) entries may be removed. If all entries * are removed, the NULL transliterator will be added. * * Delete entries with empty basicIDs; these are generated by * elements like "(A)" in the forward direction, or "A()" in * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert * SingleID entries to actual transliterators. * * @param list vector of SingleID objects. On exit, vector * of one or more Transliterators. * @return new value of insertIndex. The index will shift if * there are empty items, like "(Lower)", with indices less than * insertIndex. */ void TransliteratorIDParser::instantiateList(UVector & list, UErrorCode & ec) { UVector tlist(ec); if (U_FAILURE(ec)) { goto RETURN; } tlist.setDeleter(_deleteTransliteratorTrIDPars); Transliterator * t; int32_t i; for (i = 0; i <= list.size(); ++i) // [sic]: i<=list.size() { // We run the loop too long by one, so we can // do an insert after the last element if (i == list.size()) { break; } SingleID * single = (SingleID *) list.elementAt(i); if (single->basicID.length() != 0) { t = single->createInstance(); if (t == NULL) { ec = U_INVALID_ID; goto RETURN; } tlist.addElement(t, ec); if (U_FAILURE(ec)) { delete t; goto RETURN; } } } // An empty list is equivalent to a NULL transliterator. if (tlist.size() == 0) { t = createBasicInstance(ANY_NULL, NULL); if (t == NULL) { // Should never happen ec = U_INTERNAL_TRANSLITERATOR_ERROR; } tlist.addElement(t, ec); if (U_FAILURE(ec)) { delete t; } } RETURN: UObjectDeleter * save = list.setDeleter(_deleteSingleID); list.removeAllElements(); if (U_SUCCESS(ec)) { list.setDeleter(_deleteTransliteratorTrIDPars); while (tlist.size() > 0) { t = (Transliterator *) tlist.orphanElementAt(0); list.addElement(t, ec); if (U_FAILURE(ec)) { delete t; list.removeAllElements(); break; } } } list.setDeleter(save); }
U_CDECL_END

/**
 * Parse a compound ID, consisting of an optional forward global
 * filter, a separator, one or more single IDs delimited by
 * separators, and an optional reverse global filter.  The
 * separator is a semicolon.  The global filters are UnicodeSet
 * patterns.  The reverse global filter must be enclosed in
 * parentheses.
 * @param id the pattern to parse
 * @param dir the direction.
 * @param canonID OUTPUT parameter that receives the canonical ID,
 * consisting of canonical IDs for all elements, as returned by
 * parseSingleID(), separated by semicolons.  Previous contents
 * are discarded.
 * @param list OUTPUT parameter that receives a list of SingleID
 * objects representing the parsed IDs.  Previous contents are
 * discarded.  On failure the list is emptied and its SingleID
 * objects are deleted.
 * @param globalFilter OUTPUT parameter that receives a pointer to
 * a newly created global filter for this ID in this direction, or
 * NULL if there is none (or if the parse fails).
 * @return TRUE if the parse succeeds, that is, if the entire
 * id is consumed without syntax error.
 */
UBool TransliteratorIDParser::parseCompoundID(const UnicodeString & id, int32_t dir,
                                              UnicodeString & canonID,
                                              UVector & list,
                                              UnicodeSet *& globalFilter) {
    UErrorCode ec = U_ZERO_ERROR;
    int32_t i;
    int32_t pos = 0;
    int32_t withParens = 0; // leading filter: parens disallowed
    list.removeAllElements();
    UnicodeSet * filter;
    globalFilter = NULL;
    canonID.truncate(0);

    // Parse leading global filter, if any
    filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
    if (filter != NULL) {
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            // Not a global filter; backup and resume
            canonID.truncate(0);
            pos = 0;
        }
        // NOTE(review): the filter is kept (FORWARD) or discarded (REVERSE)
        // even when the delimiter was missing and parsing restarts at 0.
        // This is the pre-existing behavior and is preserved here.
        if (dir == FORWARD) {
            globalFilter = filter;
        } else {
            delete filter;
        }
        filter = NULL;
    }

    UBool sawDelimiter = TRUE;
    for (;;) {
        SingleID * single = parseSingleID(id, pos, dir, ec);
        if (single == NULL) {
            break;
        }
        // Reverse direction builds the list back to front.
        if (dir == FORWARD) {
            list.addElement(single, ec);
        } else {
            list.insertElementAt(single, 0, ec);
        }
        if (U_FAILURE(ec)) {
            // The element was not stored in the list, so the FAIL cleanup
            // below would not reach it; free it here.
            delete single;
            goto FAIL;
        }
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            sawDelimiter = FALSE;
            break;
        }
    }

    if (list.size() == 0) {
        goto FAIL;
    }

    // Construct canonical ID
    for (i = 0; i < list.size(); ++i) {
        SingleID * single = (SingleID *) list.elementAt(i);
        canonID.append(single->canonID);
        if (i != (list.size() - 1)) {
            canonID.append(ID_DELIM);
        }
    }

    // Parse trailing global filter, if any, and only if we saw
    // a trailing delimiter after the IDs.
    if (sawDelimiter) {
        withParens = 1; // parens required
        filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
        if (filter != NULL) {
            // Don't require trailing ';', but parse it if present
            ICU_Utility::parseChar(id, pos, ID_DELIM);

            if (dir == REVERSE) {
                globalFilter = filter;
            } else {
                delete filter;
            }
            filter = NULL;
        }
    }

    // Trailing unparsed text is a syntax error
    ICU_Utility::skipWhitespace(id, pos, TRUE);
    if (pos != id.length()) {
        goto FAIL;
    }

    return TRUE;

 FAIL:
    // Temporarily install the SingleID deleter so that clearing the list
    // frees its elements, then restore whatever deleter the caller had.
    UObjectDeleter * save = list.setDeleter(_deleteSingleID);
    list.removeAllElements();
    list.setDeleter(save);
    delete globalFilter;
    globalFilter = NULL;
    return FALSE;
}
// // First characters in scripts. // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. // The vector is sorted according to this index's collation. // // This code is too slow to use, so for now hard code the data. // Hard coded implementation is follows. // UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) { if (U_FAILURE(status)) { return NULL; } UnicodeString results[USCRIPT_CODE_LIMIT]; UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a"); UnicodeSetIterator siter(*TO_TRY); while (siter.next()) { const UnicodeString ¤t = siter.getString(); Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A); if (r < 0) { // TODO fix; we only want "real" script characters, not // symbols. continue; } int script = uscript_getScript(current.char32At(0), &status); if (results[script].length() == 0) { results[script] = current; } else if (ruleBasedCollator->compare(current, results[script]) < 0) { results[script] = current; } } UnicodeSet extras; UnicodeSet expansions; RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator); const UCollator *uRuleBasedCollator = rbc->getUCollator(); ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status); extras.addAll(expansions).removeAll(*TO_TRY); if (extras.size() != 0) { const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status); UnicodeSetIterator extrasIter(extras); while (extrasIter.next()) { const UnicodeString ¤t = extrasIter.next(); if (!TO_TRY->containsAll(current)) continue; if (!normalizer->isNormalized(current, status) || ruleBasedCollator->compare(current, LOWER_A) < 0) { continue; } int script = uscript_getScript(current.char32At(0), &status); if (results[script].length() == 0) { results[script] = current; } else if (ruleBasedCollator->compare(current, results[script]) < 0) { results[script] = current; } } } UVector *dest = new 
UVector(status); dest->setDeleter(uprv_deleteUObject); for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) { if (results[i].length() > 0) { dest->addElement(results[i].clone(), status); } } dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status); return dest; }
/**
 * (Re)build the index if indexBuildRequired_ is set: compute the final list
 * of labels from initialLabels_, create the bucket list, and bin the records.
 *
 * Does nothing if status is already a failure or if no rebuild is required.
 * If allocating the reduced label vector fails, status is set to
 * U_MEMORY_ALLOCATION_ERROR and the function returns early with
 * indexBuildRequired_ still set.
 *
 * @param status ICU error code, passed through to every helper called here.
 */
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.

    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // first sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator

    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    // The comparator receives &status as its context pointer.
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.

    UnicodeSet labelSet;
    for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) {
        UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //        is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets != NULL) {
                        targets->add(item);
                    } else {
                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        targets = new UnicodeSet();
                        if (targets == NULL) {
                            status = U_MEMORY_ALLOCATION_ERROR;
                        } else {
                            // Fill the set before handing it to the hash table, so it is
                            // never touched after uhash_put(), which may fail.
                            targets->add(item);
                            uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                        }
                    }
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.

    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
    //    Implemented by copying the elements to be retained to a new UVector.

    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        UVector *newLabels = new UVector(status);
        if (newLabels == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newLabels->setDeleter(uprv_deleteUObject);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            // A label is kept only when its scaled position differs from the
            // previously kept one; the others are dropped.
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump != old) {
                UObject *copy = str->clone();
                if (copy == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }
                newLabels->addElement(copy, status);
                if (U_FAILURE(status)) {
                    // Not stored in the vector, so it must be freed here.
                    delete copy;
                    break;
                }
                old = bump;
            }
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.

    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);

    indexBuildRequired_ = FALSE;
    resetBucketIterator(status);
}
void ZoneMeta::initAvailableMetaZoneIDs () { UBool initialized; UMTX_CHECK(&gZoneMetaLock, gMetaZoneIDsInitialized, initialized); if (!initialized) { umtx_lock(&gZoneMetaLock); { if (!gMetaZoneIDsInitialized) { UErrorCode status = U_ZERO_ERROR; UHashtable *metaZoneIDTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, &status); uhash_setKeyDeleter(metaZoneIDTable, uprv_deleteUObject); // No valueDeleter, because the vector maintain the value objects UVector *metaZoneIDs = NULL; if (U_SUCCESS(status)) { metaZoneIDs = new UVector(NULL, uhash_compareUChars, status); if (metaZoneIDs == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } } else { uhash_close(metaZoneIDTable); } if (U_SUCCESS(status)) { U_ASSERT(metaZoneIDs != NULL); metaZoneIDs->setDeleter(uprv_free); UResourceBundle *rb = ures_openDirect(NULL, gMetaZones, &status); UResourceBundle *bundle = ures_getByKey(rb, gMapTimezonesTag, NULL, &status); UResourceBundle res; ures_initStackObject(&res); while (U_SUCCESS(status) && ures_hasNext(bundle)) { ures_getNextResource(bundle, &res, &status); if (U_FAILURE(status)) { break; } const char *mzID = ures_getKey(&res); int32_t len = uprv_strlen(mzID); UChar *uMzID = (UChar*)uprv_malloc(sizeof(UChar) * (len + 1)); if (uMzID == NULL) { status = U_MEMORY_ALLOCATION_ERROR; break; } u_charsToUChars(mzID, uMzID, len); uMzID[len] = 0; UnicodeString *usMzID = new UnicodeString(uMzID); if (uhash_get(metaZoneIDTable, usMzID) == NULL) { metaZoneIDs->addElement((void *)uMzID, status); uhash_put(metaZoneIDTable, (void *)usMzID, (void *)uMzID, &status); } else { uprv_free(uMzID); delete usMzID; } } if (U_SUCCESS(status)) { gMetaZoneIDs = metaZoneIDs; gMetaZoneIDTable = metaZoneIDTable; gMetaZoneIDsInitialized = TRUE; } else { uhash_close(metaZoneIDTable); delete metaZoneIDs; } ures_close(&res); ures_close(bundle); ures_close(rb); } } } umtx_unlock(&gZoneMetaLock); } }