/**
 * Finds the Olson time zone ID mapped to a metazone for a given region.
 *
 * The mapping list registered for mzid in gMetaToOlson is scanned in order.
 * An entry whose territory equals region is taken immediately. Otherwise the
 * most recently seen entry whose territory equals gWorld is used as fallback.
 *
 * @param mzid   the metazone ID to look up
 * @param region the region used to choose among the metazone's mappings
 * @param result receives the zone ID; emptied when no mapping applies, when
 *               the table is unavailable, or when mzid does not fit
 *               (NUL-terminated) in a ZID_KEY_MAX buffer
 * @return       a reference to result
 */
UnicodeString& U_EXPORT2
ZoneMeta::getZoneIdByMetazone(const UnicodeString &mzid, const UnicodeString &region, UnicodeString &result) {
    initializeMetaToOlson();
    UBool isSet = FALSE;
    if (gMetaToOlson != NULL) {
        UErrorCode status = U_ZERO_ERROR;
        UChar mzidUChars[ZID_KEY_MAX];
        mzid.extract(mzidUChars, ZID_KEY_MAX, status);
        // The hash key must be a complete, NUL-terminated string.
        if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
            UVector *mappings = (UVector*)uhash_get(gMetaToOlson, mzidUChars);
            if (mappings != NULL) {
                // Find a preferred time zone for the given region.
                for (int32_t i = 0; i < mappings->size(); i++) {
                    MetaToOlsonMappingEntry *olsonmap = (MetaToOlsonMappingEntry*)mappings->elementAt(i);
                    if (region.compare(olsonmap->territory, -1) == 0) {
                        // Exact region match: stop searching.
                        result.setTo(olsonmap->id);
                        isSet = TRUE;
                        break;
                    } else if (u_strcmp(olsonmap->territory, gWorld) == 0) {
                        // World mapping: remember it, but keep looking for an exact match.
                        result.setTo(olsonmap->id);
                        isSet = TRUE;
                    }
                }
            }
        }
    }
    if (!isSet) {
        result.remove();
    }
    return result;
}
/**
 * Returns the match length recorded for the match at the given index.
 *
 * @param index position in fMatches
 * @return the entry's matchLength, or -1 when elementAt() yields NULL
 */
int32_t
TimeZoneGenericNameMatchInfo::getMatchLength(int32_t index) const {
    // NOTE(review): the sibling accessors read fMatches entries as GMatchInfo,
    // while this one reads them as ZMatchInfo - confirm the intended type.
    const ZMatchInfo *entry = (const ZMatchInfo *)fMatches->elementAt(index);
    return (entry == NULL) ? -1 : entry->matchLength;
}
/**
 * Returns the generic name type of the match at the given index.
 *
 * @param index position in fMatches
 * @return the entry's name type, or UTZGNM_UNKNOWN when elementAt() yields NULL
 */
UTimeZoneGenericNameType
TimeZoneGenericNameMatchInfo::getGenericNameType(int32_t index) const {
    const GMatchInfo *entry = (const GMatchInfo *)fMatches->elementAt(index);
    if (entry == NULL) {
        return UTZGNM_UNKNOWN;
    }
    return static_cast<UTimeZoneGenericNameType>(entry->gnameInfo->type);
}
/**
 * Advances the enumeration and returns the next metazone ID.
 *
 * @param status error code; nothing is returned if it already indicates failure
 * @return pointer to the internal string holding the next ID, or NULL when
 *         status is a failure, there is no ID list, or the list is exhausted
 */
const UnicodeString*
MetaZoneIDsEnumeration::snext(UErrorCode& status) {
    if (U_FAILURE(status) || fMetaZoneIDs == NULL || fPos >= fLen) {
        return NULL;
    }
    const UChar *nextID = (const UChar*)fMetaZoneIDs->elementAt(fPos);
    ++fPos;
    unistr.setTo(nextID, -1);
    return &unistr;
}
/**
 * Retrieves the time zone ID associated with the match at the given index.
 *
 * @param index position in fMatches
 * @param tzID  receives the ID; set to bogus when the entry or its tzID is NULL
 * @return      a reference to tzID
 */
UnicodeString&
TimeZoneGenericNameMatchInfo::getTimeZoneID(int32_t index, UnicodeString& tzID) const {
    const GMatchInfo *entry = (const GMatchInfo *)fMatches->elementAt(index);
    if (entry == NULL || entry->gnameInfo->tzID == NULL) {
        tzID.setToBogus();
        return tzID;
    }
    tzID.setTo(TRUE, entry->gnameInfo->tzID, -1);
    return tzID;
}
/** * Finish constructing a transliterator: only to be called by * constructors. Before calling init(), set trans and filter to NULL. * @param list a vector of transliterator objects to be adopted. It * should NOT be empty. The list should be in declared order. That * is, it should be in the FORWARD order; if direction is REVERSE then * the list order will be reversed. * @param direction either FORWARD or REVERSE * @param fixReverseID if TRUE, then reconstruct the ID of reverse * entries by calling getID() of component entries. Some constructors * do not require this because they apply a facade ID anyway. * @param status the error code indicating success or failure */ void CompoundTransliterator::init(UVector& list, UTransDirection direction, UBool fixReverseID, UErrorCode& status) { // assert(trans == 0); // Allocate array if (U_SUCCESS(status)) { count = list.size(); trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *)); /* test for NULL */ if (trans == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } } if (U_FAILURE(status) || trans == 0) { // assert(trans == 0); return; } // Move the transliterators from the vector into an array. // Reverse the order if necessary. int32_t i; for (i=0; i<count; ++i) { int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; trans[i] = (Transliterator*) list.elementAt(j); } // Fix compoundRBTIndex for REVERSE transliterators if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) { compoundRBTIndex = count - 1 - compoundRBTIndex; } // If the direction is UTRANS_REVERSE then we may need to fix the // ID. if (direction == UTRANS_REVERSE && fixReverseID) { UnicodeString newID; for (i=0; i<count; ++i) { if (i > 0) { newID.append(ID_DELIM); } newID.append(trans[i]->getID()); } setID(newID); } computeMaximumContextLength(); }
//----------------------------------------------------------------------------- // // bofFixup. Fixup for state tables that include {bof} beginning of input testing. // Do an swizzle similar to chaining, modifying the followPos set of // the bofNode to include the followPos nodes from other {bot} nodes // scattered through the tree. // // This function has much in common with calcChainedFollowPos(). // //----------------------------------------------------------------------------- void RBBITableBuilder::bofFixup() { if (U_FAILURE(*fStatus)) { return; } // The parse tree looks like this ... // fTree root ---> <cat> // / \ . // <cat> <#end node> // / \ . // <bofNode> rest // of tree // // We will be adding things to the followPos set of the <bofNode> // RBBINode * bofNode = fTree->fLeftChild->fLeftChild; U_ASSERT(bofNode->fType == RBBINode::leafChar); U_ASSERT(bofNode->fVal == 2); // Get all nodes that can be the start a match of the user-written rules // (excluding the fake bofNode) // We want the nodes that can start a match in the // part labeled "rest of tree" // UVector * matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; RBBINode * startNode; int startNodeIx; for (startNodeIx = 0; startNodeIx < matchStartNodes->size(); startNodeIx++) { startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); if (startNode->fType != RBBINode::leafChar) { continue; } if (startNode->fVal == bofNode->fVal) { // We found a leaf node corresponding to a {bof} that was // explicitly written into a rule. // Add everything from the followPos set of this node to the // followPos set of the fake bofNode at the start of the tree. // setAdd(bofNode->fFollowPos, startNode->fFollowPos); } } }
//----------------------------------------------------------------------------- // // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 // //----------------------------------------------------------------------------- void RBBITableBuilder::calcFollowPos(RBBINode * n) { if (n == NULL || n->fType == RBBINode::leafChar || n->fType == RBBINode::endMark) { return; } calcFollowPos(n->fLeftChild); calcFollowPos(n->fRightChild); // Aho rule #1 if (n->fType == RBBINode::opCat) { RBBINode * i; // is 'i' in Aho's description uint32_t ix; UVector * LastPosOfLeftChild = n->fLeftChild->fLastPosSet; for (ix = 0; ix < (uint32_t)LastPosOfLeftChild->size(); ix++) { i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); } } // Aho rule #2 if (n->fType == RBBINode::opStar || n->fType == RBBINode::opPlus) { RBBINode * i; // again, n and i are the names from Aho's description. uint32_t ix; for (ix = 0; ix < (uint32_t)n->fLastPosSet->size(); ix++) { i = (RBBINode *)n->fLastPosSet->elementAt(ix); setAdd(i->fFollowPos, n->fFirstPosSet); } } }
/**
 * Fetches the index-th variant registered for a source/target pair.
 *
 * @param index  position in the variant list of the source/target pair
 * @param source source specifier, looked up in specDAG
 * @param target target specifier, looked up among the source's targets
 * @param result receives a copy of the variant; emptied when the source,
 *               the target or the index has no entry
 * @return       a reference to result
 */
UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
                                                           const UnicodeString& source,
                                                           const UnicodeString& target,
                                                           UnicodeString& result) const {
    const UnicodeString *variant = 0;

    Hashtable *targets = (Hashtable*) specDAG.get(source);
    if (targets != 0) {
        UVector *variants = (UVector*) targets->get(target);
        if (variants != 0) {
            variant = (UnicodeString*) variants->elementAt(index);
        }
    }

    if (variant != 0) {
        result = *variant;
    } else {
        result.truncate(0); // invalid source, target or index
    }
    return result;
}
//----------------------------------------------------------------------------- // // calcChainedFollowPos. Modify the previously calculated followPos sets // to implement rule chaining. NOT described by Aho // //----------------------------------------------------------------------------- void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { UVector endMarkerNodes(*fStatus); UVector leafNodes(*fStatus); int32_t i; if (U_FAILURE(*fStatus)) { return; } // get a list of all endmarker nodes. tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); // get a list all leaf nodes tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); if (U_FAILURE(*fStatus)) { return; } // Get all nodes that can be the start a match, which is FirstPosition() // of the portion of the tree corresponding to user-written rules. // See the tree description in bofFixup(). RBBINode *userRuleRoot = tree; if (fRB->fSetBuilder->sawBOF()) { userRuleRoot = tree->fLeftChild->fRightChild; } U_ASSERT(userRuleRoot != NULL); UVector *matchStartNodes = userRuleRoot->fFirstPosSet; // Iteratate over all leaf nodes, // int32_t endNodeIx; int32_t startNodeIx; for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx); RBBINode *endNode = NULL; // Identify leaf nodes that correspond to overall rule match positions. // These include an endMarkerNode in their followPos sets. for (i=0; i<endMarkerNodes.size(); i++) { if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) { endNode = tNode; break; } } if (endNode == NULL) { // node wasn't an end node. Try again with the next. continue; } // We've got a node that can end a match. // Line Break Specific hack: If this node's val correspond to the $CM char class, // don't chain from it. // TODO: Add rule syntax for this behavior, get specifics out of here and // into the rule file. 
if (fRB->fLBCMNoChain) { UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); if (c != -1) { // c == -1 occurs with sets containing only the {eof} marker string. ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); if (cLBProp == U_LB_COMBINING_MARK) { continue; } } } // Now iterate over the nodes that can start a match, looking for ones // with the same char class as our ending node. RBBINode *startNode; for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); if (startNode->fType != RBBINode::leafChar) { continue; } if (endNode->fVal == startNode->fVal) { // The end val (character class) of one possible match is the // same as the start of another. // Add all nodes from the followPos of the start node to the // followPos set of the end node, which will have the effect of // letting matches transition from a match state at endNode // to the second char of a match starting with startNode. setAdd(endNode->fFollowPos, startNode->fFollowPos); } } } }
/** * Convert the elements of the 'list' vector, which are SingleID * objects, into actual Transliterator objects. In the course of * this, some (or all) entries may be removed. If all entries * are removed, the NULL transliterator will be added. * * Delete entries with empty basicIDs; these are generated by * elements like "(A)" in the forward direction, or "A()" in * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert * SingleID entries to actual transliterators. * * @param list vector of SingleID objects. On exit, vector * of one or more Transliterators. * @return new value of insertIndex. The index will shift if * there are empty items, like "(Lower)", with indices less than * insertIndex. */ void TransliteratorIDParser::instantiateList(UVector & list, UErrorCode & ec) { UVector tlist(ec); if (U_FAILURE(ec)) { goto RETURN; } tlist.setDeleter(_deleteTransliteratorTrIDPars); Transliterator * t; int32_t i; for (i = 0; i <= list.size(); ++i) // [sic]: i<=list.size() { // We run the loop too long by one, so we can // do an insert after the last element if (i == list.size()) { break; } SingleID * single = (SingleID *) list.elementAt(i); if (single->basicID.length() != 0) { t = single->createInstance(); if (t == NULL) { ec = U_INVALID_ID; goto RETURN; } tlist.addElement(t, ec); if (U_FAILURE(ec)) { delete t; goto RETURN; } } } // An empty list is equivalent to a NULL transliterator. if (tlist.size() == 0) { t = createBasicInstance(ANY_NULL, NULL); if (t == NULL) { // Should never happen ec = U_INTERNAL_TRANSLITERATOR_ERROR; } tlist.addElement(t, ec); if (U_FAILURE(ec)) { delete t; } } RETURN: UObjectDeleter * save = list.setDeleter(_deleteSingleID); list.removeAllElements(); if (U_SUCCESS(ec)) { list.setDeleter(_deleteTransliteratorTrIDPars); while (tlist.size() > 0) { t = (Transliterator *) tlist.orphanElementAt(0); list.addElement(t, ec); if (U_FAILURE(ec)) { delete t; list.removeAllElements(); break; } } } list.setDeleter(save); }
U_CDECL_END

/**
 * Parse a compound ID: an optional forward global filter, a separator,
 * one or more single IDs delimited by separators, and an optional reverse
 * global filter. The separator is a semicolon. The global filters are
 * UnicodeSet patterns, and the reverse global filter must be enclosed in
 * parentheses.
 * @param id the pattern to parse
 * @param dir the direction.
 * @param canonID OUTPUT parameter that receives the canonical ID, built
 * from the canonical IDs of all elements as returned by parseSingleID(),
 * separated by semicolons. Previous contents are discarded.
 * @param list OUTPUT parameter that receives a list of SingleID objects
 * representing the parsed IDs. Previous contents are discarded. Emptied
 * again if the parse fails.
 * @param globalFilter OUTPUT parameter that receives a pointer to a newly
 * created global filter for this ID in this direction, or NULL if there is
 * none or the parse fails.
 * @return TRUE if the parse succeeds, that is, if the entire id is
 * consumed without syntax error.
 */
UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
                                              UnicodeString& canonID,
                                              UVector& list,
                                              UnicodeSet*& globalFilter) {
    UErrorCode ec = U_ZERO_ERROR;
    int32_t pos = 0;

    list.removeAllElements();
    globalFilter = NULL;
    canonID.truncate(0);

    // Parse leading global filter, if any. Parentheses are disallowed here.
    int32_t withParens = 0;
    UnicodeSet *filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
    if (filter != NULL) {
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            // Not a global filter; backup and resume
            canonID.truncate(0);
            pos = 0;
        }
        if (dir == FORWARD) {
            globalFilter = filter;
        } else {
            delete filter;
        }
    }

    // Parse the delimited single IDs. Forward IDs are appended; reverse IDs
    // are prepended so that the list ends up in reversed order.
    UBool ok = TRUE;
    UBool sawDelimiter = TRUE;
    for (;;) {
        SingleID *single = parseSingleID(id, pos, dir, ec);
        if (single == NULL) {
            break;
        }
        if (dir == FORWARD) {
            list.addElement(single, ec);
        } else {
            list.insertElementAt(single, 0, ec);
        }
        if (U_FAILURE(ec)) {
            // NOTE(review): if the vector did not store 'single' it looks
            // unowned at this point - confirm against UVector's semantics.
            ok = FALSE;
            break;
        }
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            sawDelimiter = FALSE;
            break;
        }
    }

    // At least one single ID is required.
    if (ok && list.size() == 0) {
        ok = FALSE;
    }

    if (ok) {
        // Construct canonical ID
        const int32_t lastIndex = list.size() - 1;
        for (int32_t k = 0; k <= lastIndex; ++k) {
            SingleID *entry = (SingleID *) list.elementAt(k);
            canonID.append(entry->canonID);
            if (k != lastIndex) {
                canonID.append(ID_DELIM);
            }
        }

        // Parse trailing global filter, if any, and only if we saw
        // a trailing delimiter after the IDs.
        if (sawDelimiter) {
            withParens = 1; // parens required
            filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
            if (filter != NULL) {
                // Don't require trailing ';', but parse it if present
                ICU_Utility::parseChar(id, pos, ID_DELIM);

                if (dir == REVERSE) {
                    globalFilter = filter;
                } else {
                    delete filter;
                }
            }
        }

        // Trailing unparsed text is a syntax error
        ICU_Utility::skipWhitespace(id, pos, TRUE);
        ok = (pos == id.length());
    }

    if (ok) {
        return TRUE;
    }

    // Failure: release everything handed out so far.
    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
    list.removeAllElements();
    list.setDeleter(save);
    delete globalFilter;
    globalFilter = NULL;
    return FALSE;
}
void BasicTimeZone::getTimeZoneRulesAfter(UDate start, InitialTimeZoneRule*& initial, UVector*& transitionRules, UErrorCode& status) /*const*/ { if (U_FAILURE(status)) { return; } const InitialTimeZoneRule *orgini; const TimeZoneRule **orgtrs = NULL; TimeZoneTransition tzt; UBool avail; UVector *orgRules = NULL; int32_t ruleCount; TimeZoneRule *r = NULL; UBool *done = NULL; InitialTimeZoneRule *res_initial = NULL; UVector *filteredRules = NULL; UnicodeString name; int32_t i; UDate time, t; UDate *newTimes = NULL; UDate firstStart; UBool bFinalStd = FALSE, bFinalDst = FALSE; // Original transition rules ruleCount = countTransitionRules(status); if (U_FAILURE(status)) { return; } orgRules = new UVector(ruleCount, status); if (U_FAILURE(status)) { return; } orgtrs = (const TimeZoneRule**)uprv_malloc(sizeof(TimeZoneRule*)*ruleCount); if (orgtrs == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto error; } getTimeZoneRules(orgini, orgtrs, ruleCount, status); if (U_FAILURE(status)) { goto error; } for (i = 0; i < ruleCount; i++) { orgRules->addElement(orgtrs[i]->clone(), status); if (U_FAILURE(status)) { goto error; } } uprv_free(orgtrs); orgtrs = NULL; avail = getPreviousTransition(start, TRUE, tzt); if (!avail) { // No need to filter out rules only applicable to time before the start initial = orgini->clone(); transitionRules = orgRules; return; } done = (UBool*)uprv_malloc(sizeof(UBool)*ruleCount); if (done == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto error; } filteredRules = new UVector(status); if (U_FAILURE(status)) { goto error; } // Create initial rule tzt.getTo()->getName(name); res_initial = new InitialTimeZoneRule(name, tzt.getTo()->getRawOffset(), tzt.getTo()->getDSTSavings()); // Mark rules which does not need to be processed for (i = 0; i < ruleCount; i++) { r = (TimeZoneRule*)orgRules->elementAt(i); avail = r->getNextStart(start, res_initial->getRawOffset(), res_initial->getDSTSavings(), FALSE, time); done[i] = !avail; } time = start; while 
(!bFinalStd || !bFinalDst) { avail = getNextTransition(time, FALSE, tzt); if (!avail) { break; } UDate updatedTime = tzt.getTime(); if (updatedTime == time) { // Can get here if rules for start & end of daylight time have exactly // the same time. // TODO: fix getNextTransition() to prevent it? status = U_INVALID_STATE_ERROR; goto error; } time = updatedTime; const TimeZoneRule *toRule = tzt.getTo(); for (i = 0; i < ruleCount; i++) { r = (TimeZoneRule*)orgRules->elementAt(i); if (*r == *toRule) { break; } } if (i >= ruleCount) { // This case should never happen status = U_INVALID_STATE_ERROR; goto error; } if (done[i]) { continue; } const TimeArrayTimeZoneRule *tar = dynamic_cast<const TimeArrayTimeZoneRule *>(toRule); const AnnualTimeZoneRule *ar; if (tar != NULL) { // Get the previous raw offset and DST savings before the very first start time TimeZoneTransition tzt0; t = start; while (TRUE) { avail = getNextTransition(t, FALSE, tzt0); if (!avail) { break; } if (*(tzt0.getTo()) == *tar) { break; } t = tzt0.getTime(); } if (avail) { // Check if the entire start times to be added tar->getFirstStart(tzt.getFrom()->getRawOffset(), tzt.getFrom()->getDSTSavings(), firstStart); if (firstStart > start) { // Just add the rule as is filteredRules->addElement(tar->clone(), status); if (U_FAILURE(status)) { goto error; } } else { // Colllect transitions after the start time int32_t startTimes; DateTimeRule::TimeRuleType timeType; int32_t idx; startTimes = tar->countStartTimes(); timeType = tar->getTimeType(); for (idx = 0; idx < startTimes; idx++) { tar->getStartTimeAt(idx, t); if (timeType == DateTimeRule::STANDARD_TIME) { t -= tzt.getFrom()->getRawOffset(); } if (timeType == DateTimeRule::WALL_TIME) { t -= tzt.getFrom()->getDSTSavings(); } if (t > start) { break; } } int32_t asize = startTimes - idx; if (asize > 0) { newTimes = (UDate*)uprv_malloc(sizeof(UDate) * asize); if (newTimes == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto error; } for (int32_t newidx = 0; 
newidx < asize; newidx++) { tar->getStartTimeAt(idx + newidx, newTimes[newidx]); if (U_FAILURE(status)) { uprv_free(newTimes); newTimes = NULL; goto error; } } tar->getName(name); TimeArrayTimeZoneRule *newTar = new TimeArrayTimeZoneRule(name, tar->getRawOffset(), tar->getDSTSavings(), newTimes, asize, timeType); uprv_free(newTimes); filteredRules->addElement(newTar, status); if (U_FAILURE(status)) { goto error; } } } } } else if ((ar = dynamic_cast<const AnnualTimeZoneRule *>(toRule)) != NULL) { ar->getFirstStart(tzt.getFrom()->getRawOffset(), tzt.getFrom()->getDSTSavings(), firstStart); if (firstStart == tzt.getTime()) { // Just add the rule as is filteredRules->addElement(ar->clone(), status); if (U_FAILURE(status)) { goto error; } } else { // Calculate the transition year int32_t year, month, dom, dow, doy, mid; Grego::timeToFields(tzt.getTime(), year, month, dom, dow, doy, mid); // Re-create the rule ar->getName(name); AnnualTimeZoneRule *newAr = new AnnualTimeZoneRule(name, ar->getRawOffset(), ar->getDSTSavings(), *(ar->getRule()), year, ar->getEndYear()); filteredRules->addElement(newAr, status); if (U_FAILURE(status)) { goto error; } } // check if this is a final rule if (ar->getEndYear() == AnnualTimeZoneRule::MAX_YEAR) { // After bot final standard and dst rules are processed, // exit this while loop. 
if (ar->getDSTSavings() == 0) { bFinalStd = TRUE; } else { bFinalDst = TRUE; } } } done[i] = TRUE; } // Set the results if (orgRules != NULL) { while (!orgRules->isEmpty()) { r = (TimeZoneRule*)orgRules->orphanElementAt(0); delete r; } delete orgRules; } if (done != NULL) { uprv_free(done); } initial = res_initial; transitionRules = filteredRules; return; error: if (orgtrs != NULL) { uprv_free(orgtrs); } if (orgRules != NULL) { while (!orgRules->isEmpty()) { r = (TimeZoneRule*)orgRules->orphanElementAt(0); delete r; } delete orgRules; } if (done != NULL) { if (filteredRules != NULL) { while (!filteredRules->isEmpty()) { r = (TimeZoneRule*)filteredRules->orphanElementAt(0); delete r; } delete filteredRules; } delete res_initial; uprv_free(done); } initial = NULL; transitionRules = NULL; }
/* * Initializes the region data from the ICU resource bundles. The region data * contains the basic relationships such as which regions are known, what the numeric * codes are, any known aliases, and the territory containment data. * * If the region data has already loaded, then this method simply returns without doing * anything meaningful. */ void Region::loadRegionData() { if (regionDataIsLoaded) { return; } umtx_lock(&gRegionDataLock); if (regionDataIsLoaded) { // In case another thread gets to it before we do... umtx_unlock(&gRegionDataLock); return; } UErrorCode status = U_ZERO_ERROR; UResourceBundle* regionCodes = NULL; UResourceBundle* territoryAlias = NULL; UResourceBundle* codeMappings = NULL; UResourceBundle* worldContainment = NULL; UResourceBundle* territoryContainment = NULL; UResourceBundle* groupingContainment = NULL; DecimalFormat *df = new DecimalFormat(status); df->setParseIntegerOnly(TRUE); regionIDMap = uhash_open(uhash_hashUnicodeString,uhash_compareUnicodeString,NULL,&status); uhash_setValueDeleter(regionIDMap, deleteRegion); numericCodeMap = uhash_open(uhash_hashLong,uhash_compareLong,NULL,&status); regionAliases = uhash_open(uhash_hashUnicodeString,uhash_compareUnicodeString,NULL,&status); uhash_setKeyDeleter(regionAliases,uprv_deleteUObject); UResourceBundle *rb = ures_openDirect(NULL,"metadata",&status); regionCodes = ures_getByKey(rb,"regionCodes",NULL,&status); territoryAlias = ures_getByKey(rb,"territoryAlias",NULL,&status); UResourceBundle *rb2 = ures_openDirect(NULL,"supplementalData",&status); codeMappings = ures_getByKey(rb2,"codeMappings",NULL,&status); territoryContainment = ures_getByKey(rb2,"territoryContainment",NULL,&status); worldContainment = ures_getByKey(territoryContainment,"001",NULL,&status); groupingContainment = ures_getByKey(territoryContainment,"grouping",NULL,&status); UVector *continents = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); while ( ures_hasNext(worldContainment) ) { 
UnicodeString *continentName = new UnicodeString(ures_getNextUnicodeString(worldContainment,NULL,&status)); continents->addElement(continentName,status); } UVector *groupings = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); while ( ures_hasNext(groupingContainment) ) { UnicodeString *groupingName = new UnicodeString(ures_getNextUnicodeString(groupingContainment,NULL,&status)); groupings->addElement(groupingName,status); } while ( ures_hasNext(regionCodes) ) { UnicodeString regionID = ures_getNextUnicodeString(regionCodes,NULL,&status); Region *r = new Region(); r->idStr = regionID; r->idStr.extract(0,r->idStr.length(),r->id,sizeof(r->id),US_INV); r->type = URGN_TERRITORY; // Only temporary - figure out the real type later once the aliases are known. uhash_put(regionIDMap,(void *)&(r->idStr),(void *)r,&status); Formattable result; UErrorCode ps = U_ZERO_ERROR; df->parse(r->idStr,result,ps); if ( U_SUCCESS(ps) ) { r->code = result.getLong(); // Convert string to number uhash_iput(numericCodeMap,r->code,(void *)r,&status); r->type = URGN_SUBCONTINENT; } else { r->code = Region::UNDEFINED_NUMERIC_CODE; } } // Process the territory aliases while ( ures_hasNext(territoryAlias) ) { UResourceBundle *res = ures_getNextResource(territoryAlias,NULL,&status); const char *aliasFrom = ures_getKey(res); UnicodeString* aliasFromStr = new UnicodeString(aliasFrom); UnicodeString aliasTo = ures_getUnicodeString(res,&status); ures_close(res); Region *aliasToRegion = (Region *) uhash_get(regionIDMap,&aliasTo); Region *aliasFromRegion = (Region *)uhash_get(regionIDMap,aliasFromStr); if ( aliasToRegion != NULL && aliasFromRegion == NULL ) { // This is just an alias from some string to a region uhash_put(regionAliases,(void *)aliasFromStr, (void *)aliasToRegion,&status); } else { if ( aliasFromRegion == NULL ) { // Deprecated region code not in the master codes list - so need to create a deprecated region for it. 
aliasFromRegion = new Region(); aliasFromRegion->idStr.setTo(*aliasFromStr); aliasFromRegion->idStr.extract(0,aliasFromRegion->idStr.length(),aliasFromRegion->id,sizeof(aliasFromRegion->id),US_INV); uhash_put(regionIDMap,(void *)&(aliasFromRegion->idStr),(void *)aliasFromRegion,&status); Formattable result; UErrorCode ps = U_ZERO_ERROR; df->parse(aliasFromRegion->idStr,result,ps); if ( U_SUCCESS(ps) ) { aliasFromRegion->code = result.getLong(); // Convert string to number uhash_iput(numericCodeMap,aliasFromRegion->code,(void *)aliasFromRegion,&status); } else { aliasFromRegion->code = Region::UNDEFINED_NUMERIC_CODE; } aliasFromRegion->type = URGN_DEPRECATED; } else { aliasFromRegion->type = URGN_DEPRECATED; } delete aliasFromStr; aliasFromRegion->preferredValues = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); UnicodeString currentRegion; currentRegion.remove(); for (int32_t i = 0 ; i < aliasTo.length() ; i++ ) { if ( aliasTo.charAt(i) != 0x0020 ) { currentRegion.append(aliasTo.charAt(i)); } if ( aliasTo.charAt(i) == 0x0020 || i+1 == aliasTo.length() ) { Region *target = (Region *)uhash_get(regionIDMap,(void *)¤tRegion); if (target) { UnicodeString *preferredValue = new UnicodeString(target->idStr); aliasFromRegion->preferredValues->addElement((void *)preferredValue,status); } currentRegion.remove(); } } } } // Process the code mappings - This will allow us to assign numeric codes to most of the territories. 
while ( ures_hasNext(codeMappings) ) { UResourceBundle *mapping = ures_getNextResource(codeMappings,NULL,&status); if ( ures_getType(mapping) == URES_ARRAY && ures_getSize(mapping) == 3) { UnicodeString codeMappingID = ures_getUnicodeStringByIndex(mapping,0,&status); UnicodeString codeMappingNumber = ures_getUnicodeStringByIndex(mapping,1,&status); UnicodeString codeMapping3Letter = ures_getUnicodeStringByIndex(mapping,2,&status); Region *r = (Region *)uhash_get(regionIDMap,(void *)&codeMappingID); if ( r ) { Formattable result; UErrorCode ps = U_ZERO_ERROR; df->parse(codeMappingNumber,result,ps); if ( U_SUCCESS(ps) ) { r->code = result.getLong(); // Convert string to number uhash_iput(numericCodeMap,r->code,(void *)r,&status); } UnicodeString *code3 = new UnicodeString(codeMapping3Letter); uhash_put(regionAliases,(void *)code3, (void *)r,&status); } } ures_close(mapping); } // Now fill in the special cases for WORLD, UNKNOWN, CONTINENTS, and GROUPINGS Region *r; r = (Region *) uhash_get(regionIDMap,(void *)&WORLD_ID); if ( r ) { r->type = URGN_WORLD; } r = (Region *) uhash_get(regionIDMap,(void *)&UNKNOWN_REGION_ID); if ( r ) { r->type = URGN_UNKNOWN; } for ( int32_t i = 0 ; i < continents->size() ; i++ ) { r = (Region *) uhash_get(regionIDMap,(void *)continents->elementAt(i)); if ( r ) { r->type = URGN_CONTINENT; } } delete continents; for ( int32_t i = 0 ; i < groupings->size() ; i++ ) { r = (Region *) uhash_get(regionIDMap,(void *)groupings->elementAt(i)); if ( r ) { r->type = URGN_GROUPING; } } delete groupings; // Special case: The region code "QO" (Outlying Oceania) is a subcontinent code added by CLDR // even though it looks like a territory code. Need to handle it here. r = (Region *) uhash_get(regionIDMap,(void *)&OUTLYING_OCEANIA_REGION_ID); if ( r ) { r->type = URGN_SUBCONTINENT; } // Load territory containment info from the supplemental data. 
while ( ures_hasNext(territoryContainment) ) { UResourceBundle *mapping = ures_getNextResource(territoryContainment,NULL,&status); const char *parent = ures_getKey(mapping); UnicodeString parentStr = UnicodeString(parent); Region *parentRegion = (Region *) uhash_get(regionIDMap,(void *)&parentStr); for ( int j = 0 ; j < ures_getSize(mapping); j++ ) { UnicodeString child = ures_getUnicodeStringByIndex(mapping,j,&status); Region *childRegion = (Region *) uhash_get(regionIDMap,(void *)&child); if ( parentRegion != NULL && childRegion != NULL ) { // Add the child region to the set of regions contained by the parent if (parentRegion->containedRegions == NULL) { parentRegion->containedRegions = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); } UnicodeString *childStr = new UnicodeString(status); childStr->fastCopyFrom(childRegion->idStr); parentRegion->containedRegions->addElement((void *)childStr,status); // Set the parent region to be the containing region of the child. // Regions of type GROUPING can't be set as the parent, since another region // such as a SUBCONTINENT, CONTINENT, or WORLD must always be the parent. 
if ( parentRegion->type != URGN_GROUPING) { childRegion->containingRegion = parentRegion; } } } ures_close(mapping); } // Create the availableRegions lists int32_t pos = -1; while ( const UHashElement* element = uhash_nextElement(regionIDMap,&pos)) { Region *ar = (Region *)element->value.pointer; if ( availableRegions[ar->type] == NULL ) { availableRegions[ar->type] = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); } UnicodeString *arString = new UnicodeString(ar->idStr); availableRegions[ar->type]->addElement((void *)arString,status); } ures_close(territoryContainment); ures_close(worldContainment); ures_close(groupingContainment); ures_close(codeMappings); ures_close(rb2); ures_close(territoryAlias); ures_close(regionCodes); ures_close(rb); delete df; ucln_i18n_registerCleanup(UCLN_I18N_REGION, region_cleanup); regionDataIsLoaded = true; umtx_unlock(&gRegionDataLock); }
// Build the Whole Script Confusable data // // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, // because everything is local to this one build function anyhow, // OR // break this function into more reasonably sized pieces, with // state in WSConfusableDataBuilder. // void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) { if (U_FAILURE(status)) { return; } URegularExpression *parseRegexp = NULL; int32_t inputLen = 0; UChar *input = NULL; int32_t lineNum = 0; UVector *scriptSets = NULL; uint32_t rtScriptSetsCount = 2; UTrie2 *anyCaseTrie = NULL; UTrie2 *lowerCaseTrie = NULL; anyCaseTrie = utrie2_open(0, 0, &status); lowerCaseTrie = utrie2_open(0, 0, &status); // The scriptSets vector provides a mapping from TRIE values to the set of scripts. // // Reserved TRIE values: // 0: Code point has no whole script confusables. // 1: Code point is of script Common or Inherited. // These code points do not participate in whole script confusable detection. // (This is logically equivalent to saying that they contain confusables in // all scripts) // // Because Trie values are indexes into the ScriptSets vector, pre-fill // vector positions 0 and 1 to avoid conflicts with the reserved values. 
scriptSets = new UVector(status); if (scriptSets == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement((void *)NULL, status); scriptSets->addElement((void *)NULL, status); // Convert the user input data from UTF-8 to UChar (UTF-16) u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); if (input == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); parseRegexp = uregex_openC(parseExp, 0, NULL, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*input == 0xfeff) { *input = 0x20; } // Parse the input, one line per iteration of this loop. uregex_setText(parseRegexp, input, inputLen, &status); while (uregex_findNext(parseRegexp, &status)) { lineNum++; UChar line[200]; uregex_group(parseRegexp, 0, line, 200, &status); if (uregex_start(parseRegexp, 1, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(parseRegexp, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; goto cleanup; } if (U_FAILURE(status)) { goto cleanup; } // Pick up the start and optional range end code points from the parsed line. UChar32 startCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); UChar32 endCodePoint = startCodePoint; if (uregex_start(parseRegexp, 3, &status) >=0) { endCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); } // Extract the two script names from the source line. We need these in an 8 bit // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on // to the ICU u_getPropertyValueEnum() function. 
Ugh. char srcScriptName[20]; char targScriptName[20]; extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); UScriptCode srcScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); UScriptCode targScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); if (U_FAILURE(status)) { goto cleanup; } if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } // select the table - (A) any case or (L) lower case only UTrie2 *table = anyCaseTrie; if (uregex_start(parseRegexp, 7, &status) >= 0) { table = lowerCaseTrie; } // Build the set of scripts containing confusable characters for // the code point(s) specified in this input line. // Sanity check that the script of the source code point is the same // as the source script indicated in the input file. Failure of this check is // an error in the input file. // Include the source script in the set (needed for Mixed Script Confusable detection). 
// UChar32 cp; for (cp=startCodePoint; cp<=endCodePoint; cp++) { int32_t setIndex = utrie2_get32(table, cp); BuilderScriptSet *bsset = NULL; if (setIndex > 0) { U_ASSERT(setIndex < scriptSets->size()); bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); } else { bsset = new BuilderScriptSet(); if (bsset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } bsset->codePoint = cp; bsset->trie = table; bsset->sset = new ScriptSet(); setIndex = scriptSets->size(); bsset->index = setIndex; bsset->rindex = 0; if (bsset->sset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement(bsset, status); utrie2_set32(table, cp, setIndex, &status); } bsset->sset->Union(targScript); bsset->sset->Union(srcScript); if (U_FAILURE(status)) { goto cleanup; } UScriptCode cpScript = uscript_getScript(cp, &status); if (cpScript != srcScript) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } } } // Eliminate duplicate script sets. At this point we have a separate // script set for every code point that had data in the input file. // // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them // // printf("Number of scriptSets: %d\n", scriptSets->size()); { int32_t duplicateCount = 0; rtScriptSetsCount = 2; for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); if (outerSet->index != static_cast<uint32_t>(outeri)) { // This set was already identified as a duplicate. // It will not be allocated a position in the runtime array of ScriptSets. 
continue; } outerSet->rindex = rtScriptSetsCount++; for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { delete innerSet->sset; innerSet->scriptSetOwned = FALSE; innerSet->sset = outerSet->sset; innerSet->index = outeri; innerSet->rindex = outerSet->rindex; duplicateCount++; } // But this doesn't get all. We need to fix the TRIE. } } // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); } // Update the Trie values to be reflect the run time script indexes (after duplicate merging). // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets // are unused, which is why the loop index starts at 2.) { for (int32_t i=2; i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex != (uint32_t)i) { utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); } } } // For code points with script==Common or script==Inherited, // Set the reserved value of 1 into both Tries. These characters do not participate // in Whole Script Confusable detection; this reserved value is the means // by which they are detected. 
{ UnicodeSet ignoreSet; ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); UnicodeSet inheritedSet; inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); ignoreSet.addAll(inheritedSet); for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { UChar32 rangeStart = ignoreSet.getRangeStart(rn); UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); } } // Serialize the data to the Spoof Detector { utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); // printf("Any case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; void *where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(anyCaseTrie, where, size, &status); utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); // printf("Lower case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(lowerCaseTrie, where, size, &status); spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; ScriptSet *rtScriptSets = static_cast<ScriptSet *> (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); uint32_t rindex = 2; for (int32_t i=2; 
i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex < rindex) { // We have already copied this script set to the serialized data. continue; } U_ASSERT(rindex == bSet->rindex); rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. rindex++; } } // Open new utrie2s from the serialized data. We don't want to keep the ones // we just built because we would then have two copies of the data, one internal to // the utries that we have already constructed, and one in the serialized data area. // An alternative would be to not pre-serialize the Trie data, but that makes the // spoof detector data different, depending on how the detector was constructed. // It's simpler to keep the data always the same. spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); cleanup: if (U_FAILURE(status)) { pe->line = lineNum; } uregex_close(parseRegexp); uprv_free(input); int32_t i; for (i=0; i<scriptSets->size(); i++) { BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); delete bsset; } delete scriptSets; utrie2_close(anyCaseTrie); utrie2_close(lowerCaseTrie); return; }