Exemplo n.º 1
0
int32_t
TimeZoneGenericNameMatchInfo::size() const {
    if (fMatches == NULL) {
        return 0;
    }
    return fMatches->size();
}
Exemplo n.º 2
0
 void updateVisibleIDs(Hashtable& result, UErrorCode& status) const {
     if (U_SUCCESS(_status)) {
         for (int32_t i = 0; i < _ids.size(); ++i) {
             result.put(*(UnicodeString*)_ids[i], (void*)this, status);
         }
     }
 }
UnicodeString& U_EXPORT2
ZoneMeta::getZoneIdByMetazone(const UnicodeString &mzid, const UnicodeString &region, UnicodeString &result) {
    initializeMetaToOlson();
    UBool isSet = FALSE;
    if (gMetaToOlson != NULL) {
        UErrorCode status = U_ZERO_ERROR;
        UChar mzidUChars[ZID_KEY_MAX];
        mzid.extract(mzidUChars, ZID_KEY_MAX, status);
        if (U_SUCCESS(status) && status!=U_STRING_NOT_TERMINATED_WARNING) {
            UVector *mappings = (UVector*)uhash_get(gMetaToOlson, mzidUChars);
            if (mappings != NULL) {
                // Find a preferred time zone for the given region.
                for (int32_t i = 0; i < mappings->size(); i++) {
                    MetaToOlsonMappingEntry *olsonmap = (MetaToOlsonMappingEntry*)mappings->elementAt(i);
                    if (region.compare(olsonmap->territory, -1) == 0) {
                        result.setTo(olsonmap->id);
                        isSet = TRUE;
                        break;
                    } else if (u_strcmp(olsonmap->territory, gWorld) == 0) {
                        result.setTo(olsonmap->id);
                        isSet = TRUE;
                    }
                }
            }
        }
    }
    if (!isSet) {
        result.remove();
    }
    return result;
}
Exemplo n.º 4
0
void CompactData::getUniquePatterns(UVector &output, UErrorCode &status) const {
    U_ASSERT(output.isEmpty());
    // NOTE: In C++, this is done more manually with a UVector.
    // In Java, we can take advantage of JDK HashSet.
    for (auto pattern : patterns) {
        if (pattern == nullptr || pattern == USE_FALLBACK) {
            continue;
        }

        // Insert pattern into the UVector if the UVector does not already contain the pattern.
        // Search the UVector from the end since identical patterns are likely to be adjacent.
        for (int32_t i = output.size() - 1; i >= 0; i--) {
            if (u_strcmp(pattern, static_cast<const UChar *>(output[i])) == 0) {
                goto continue_outer;
            }
        }

        // The string was not found; add it to the UVector.
        // ANDY: This requires a const_cast.  Why?
        output.addElement(const_cast<UChar *>(pattern), status);

        continue_outer:
        continue;
    }
}
Exemplo n.º 5
0
//-----------------------------------------------------------------------------
//
//  sortedAdd  Add a value to a vector of sorted values (ints).
//             Do not replicate entries; if the value is already there, do not
//                add a second one.
//             Lazily create the vector if it does not already exist.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::sortedAdd(UVector ** vector, int32_t val)
{
	int32_t i;

	if (*vector == NULL)
	{
		*vector = new UVector(*fStatus);
	}
	if (*vector == NULL || U_FAILURE(*fStatus))
	{
		return;
	}
	UVector * vec = *vector;
	int32_t  vSize = vec->size();
	for (i = 0; i < vSize; i++)
	{
		int32_t valAtI = vec->elementAti(i);
		if (valAtI == val)
		{
			// The value is already in the vector.  Don't add it again.
			return;
		}
		if (valAtI > val)
		{
			break;
		}
	}
	vec->insertElementAt(val, i, *fStatus);
}
Exemplo n.º 6
0
UVector*
RuleBasedTimeZone::copyRules(UVector* source) {
    if (source == NULL) {
        return NULL;
    }
    UErrorCode ec = U_ZERO_ERROR;
    int32_t size = source->size();
    UVector *rules = new UVector(size, ec);
    if (U_FAILURE(ec)) {
        return NULL;
    }
    int32_t i;
    for (i = 0; i < size; i++) {
        rules->addElement(((TimeZoneRule*)source->elementAt(i))->clone(), ec);
        if (U_FAILURE(ec)) {
            break;
        }
    }
    if (U_FAILURE(ec)) {
        // In case of error, clean up
        for (i = 0; i < rules->size(); i++) {
            TimeZoneRule *rule = (TimeZoneRule*)rules->orphanElementAt(i);
            delete rule;
        }
        delete rules;
        return NULL;
    }
    return rules;
}
Exemplo n.º 7
0
UBool UVector::containsNone(const UVector& other) const {
    for (int32_t i=0; i<other.size(); ++i) {
        if (indexOf(other.elements[i]) >= 0) {
            return FALSE;
        }
    }
    return TRUE;
}
Exemplo n.º 8
0
int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString& source,
                                                       const UnicodeString& target) const {
    Hashtable *targets = (Hashtable*) specDAG.get(source);
    if (targets == 0) {
        return 0;
    }
    UVector *variants = (UVector*) targets->get(target);
    // variants may be 0 if the source/target are invalid
    return (variants == 0) ? 0 : variants->size();
}
Exemplo n.º 9
0
UBool UVector::removeAll(const UVector& other) {
    UBool changed = FALSE;
    for (int32_t i=0; i<other.size(); ++i) {
        int32_t j = indexOf(other.elements[i]);
        if (j >= 0) {
            removeElementAt(j);
            changed = TRUE;
        }
    }
    return changed;
}
Exemplo n.º 10
0
uint32_t UXML2Document::getElement(UVector<UString>& velement, const char* tag, uint32_t tag_len)
{
   U_TRACE(0, "UXML2Document::getElement(%p,%.*S,%u)", &velement, tag_len, tag, tag_len)

   U_INTERNAL_ASSERT_POINTER(tag)

   UString element;
   uint32_t n = velement.size(), pos = 0;

   while (true)
      {
      pos = getElement(element, pos, tag, tag_len);

      if (pos == U_NOT_FOUND) break;

      velement.push(element);
      }

   uint32_t result = velement.size() - n;

   U_RETURN(result);
}
Exemplo n.º 11
0
/**
 * Finish constructing a transliterator: only to be called by
 * constructors.  Before calling init(), set trans and filter to NULL.
 * @param list a vector of transliterator objects to be adopted.  It
 * should NOT be empty.  The list should be in declared order.  That
 * is, it should be in the FORWARD order; if direction is REVERSE then
 * the list order will be reversed.
 * @param direction either FORWARD or REVERSE
 * @param fixReverseID if TRUE, then reconstruct the ID of reverse
 * entries by calling getID() of component entries.  Some constructors
 * do not require this because they apply a facade ID anyway.
 * @param status the error code indicating success or failure
 */
void CompoundTransliterator::init(UVector& list,
                                  UTransDirection direction,
                                  UBool fixReverseID,
                                  UErrorCode& status) {
    // assert(trans == 0);

    // Allocate array
    if (U_SUCCESS(status)) {
        count = list.size();
        trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *));
        /* test for NULL */
        if (trans == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    if (U_FAILURE(status) || trans == 0) {
         // assert(trans == 0);
        return;
    }

    // Move the transliterators from the vector into an array.
    // Reverse the order if necessary.
    int32_t i;
    for (i=0; i<count; ++i) {
        int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;
        trans[i] = (Transliterator*) list.elementAt(j);
    }

    // Fix compoundRBTIndex for REVERSE transliterators
    if (compoundRBTIndex >= 0 && direction == UTRANS_REVERSE) {
        compoundRBTIndex = count - 1 - compoundRBTIndex;
    }

    // If the direction is UTRANS_REVERSE then we may need to fix the
    // ID.
    if (direction == UTRANS_REVERSE && fixReverseID) {
        UnicodeString newID;
        for (i=0; i<count; ++i) {
            if (i > 0) {
                newID.append(ID_DELIM);
            }
            newID.append(trans[i]->getID());
        }
        setID(newID);
    }

    computeMaximumContextLength();
}
Exemplo n.º 12
0
//-----------------------------------------------------------------------------
//
//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
//                Do an swizzle similar to chaining, modifying the followPos set of
//                the bofNode to include the followPos nodes from other {bot} nodes
//                scattered through the tree.
//
//                This function has much in common with calcChainedFollowPos().
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::bofFixup()
{

	if (U_FAILURE(*fStatus))
	{
		return;
	}

	//   The parse tree looks like this ...
	//         fTree root  --->       <cat>
	//                               /     \       .
	//                            <cat>   <#end node>
	//                           /     \  .
	//                     <bofNode>   rest
	//                               of tree
	//
	//    We will be adding things to the followPos set of the <bofNode>
	//
	RBBINode * bofNode = fTree->fLeftChild->fLeftChild;
	U_ASSERT(bofNode->fType == RBBINode::leafChar);
	U_ASSERT(bofNode->fVal == 2);

	// Get all nodes that can be the start a match of the user-written rules
	//  (excluding the fake bofNode)
	//  We want the nodes that can start a match in the
	//     part labeled "rest of tree"
	//
	UVector * matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;

	RBBINode * startNode;
	int       startNodeIx;
	for (startNodeIx = 0; startNodeIx < matchStartNodes->size(); startNodeIx++)
	{
		startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
		if (startNode->fType != RBBINode::leafChar)
		{
			continue;
		}

		if (startNode->fVal == bofNode->fVal)
		{
			//  We found a leaf node corresponding to a {bof} that was
			//    explicitly written into a rule.
			//  Add everything from the followPos set of this node to the
			//    followPos set of the fake bofNode at the start of the tree.
			//
			setAdd(bofNode->fFollowPos, startNode->fFollowPos);
		}
	}
}
Exemplo n.º 13
0
//-----------------------------------------------------------------------------
//
//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcFollowPos(RBBINode * n)
{
	if (n == NULL ||
	    n->fType == RBBINode::leafChar ||
	    n->fType == RBBINode::endMark)
	{
		return;
	}

	calcFollowPos(n->fLeftChild);
	calcFollowPos(n->fRightChild);

	// Aho rule #1
	if (n->fType == RBBINode::opCat)
	{
		RBBINode * i;  // is 'i' in Aho's description
		uint32_t     ix;

		UVector * LastPosOfLeftChild = n->fLeftChild->fLastPosSet;

		for (ix = 0; ix < (uint32_t)LastPosOfLeftChild->size(); ix++)
		{
			i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
			setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
		}
	}

	// Aho rule #2
	if (n->fType == RBBINode::opStar ||
	    n->fType == RBBINode::opPlus)
	{
		RBBINode  * i;  // again, n and i are the names from Aho's description.
		uint32_t    ix;

		for (ix = 0; ix < (uint32_t)n->fLastPosSet->size(); ix++)
		{
			i = (RBBINode *)n->fLastPosSet->elementAt(ix);
			setAdd(i->fFollowPos, n->fFirstPosSet);
		}
	}



}
Exemplo n.º 14
0
void RBBITableBuilder::printRuleStatusTable() {
    int32_t  thisRecord = 0;
    int32_t  nextRecord = 0;
    int      i;
    UVector  *tbl = fRB->fRuleStatusVals;

    RBBIDebugPrintf("index |  tags \n");
    RBBIDebugPrintf("-------------------\n");

    while (nextRecord < tbl->size()) {
        thisRecord = nextRecord;
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
        RBBIDebugPrintf("%4d   ", thisRecord);
        for (i=thisRecord+1; i<nextRecord; i++) {
            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
}
Exemplo n.º 15
0
/**
 * Remove a source-target/variant from the specDAG.
 */
void TransliteratorRegistry::removeSTV(const UnicodeString& source,
                                       const UnicodeString& target,
                                       const UnicodeString& variant) {
    // assert(source.length() > 0);
    // assert(target.length() > 0);
//    UErrorCode status = U_ZERO_ERROR;
    Hashtable *targets = (Hashtable*) specDAG.get(source);
    if (targets == 0) {
        return; // should never happen for valid s-t/v
    }
    UVector *variants = (UVector*) targets->get(target);
    if (variants == 0) {
        return; // should never happen for valid s-t/v
    }
    variants->removeElement((void*) &variant);
    if (variants->size() == 0) {
        targets->remove(target); // should delete variants
        if (targets->count() == 0) {
            specDAG.remove(source); // should delete targets
        }
    }
}
Exemplo n.º 16
0
//-----------------------------------------------------------------------------
//
//  mergeRuleStatusVals
//
//      Update the global table of rule status {tag} values
//      The rule builder has a global vector of status values that are common
//      for all tables.  Merge the ones from this table into the global set.
//
//-----------------------------------------------------------------------------
void  RBBITableBuilder::mergeRuleStatusVals() {
    //
    //  The basic outline of what happens here is this...
    //
    //    for each state in this state table
    //       if the status tag list for this state is in the global statuses list
    //           record where and
    //           continue with the next state
    //       else
    //           add the tag list for this state to the global list.
    //
    int i;
    int n;

    // Pre-set a single tag of {0} into the table.
    //   We will need this as a default, for rule sets with no explicit tagging.
    if (fRB->fRuleStatusVals->size() == 0) {
        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
    }

    //    For each state
    for (n=0; n<fDStates->size(); n++) {
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
        UVector *thisStatesTagValues = sd->fTagVals;
        if (thisStatesTagValues == NULL) {
            // No tag values are explicitly associated with this state.
            //   Set the default tag value.
            sd->fTagsIdx = 0;
            continue;
        }

        // There are tag(s) associated with this state.
        //   fTagsIdx will be the index into the global tag list for this state's tag values.
        //   Initial value of -1 flags that we haven't got it set yet.
        sd->fTagsIdx = -1;
        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
        int32_t  nextTagGroupStart = 0;

        // Loop runs once per group of tags in the global list
        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
            thisTagGroupStart = nextTagGroupStart;
            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
                // The number of tags for this state is different from
                //    the number of tags in this group from the global list.
                //    Continue with the next group from the global list.
                continue;
            }
            // The lengths match, go ahead and compare the actual tag values
            //    between this state and the group from the global list.
            for (i=0; i<thisStatesTagValues->size(); i++) {
                if (thisStatesTagValues->elementAti(i) !=
                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
                    // Mismatch.
                    break;
                }
            }

            if (i == thisStatesTagValues->size()) {
                // We found a set of tag values in the global list that match
                //   those for this state.  Use them.
                sd->fTagsIdx = thisTagGroupStart;
                break;
            }
        }

        if (sd->fTagsIdx == -1) {
            // No suitable entry in the global tag list already.  Add one
            sd->fTagsIdx = fRB->fRuleStatusVals->size();
            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
            for (i=0; i<thisStatesTagValues->size(); i++) {
                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
            }
        }
    }
}
Exemplo n.º 17
0
//-----------------------------------------------------------------------------
//
//   buildStateTable()    Determine the set of runtime DFA states and the
//                        transition tables for these states, by the algorithm
//                        of fig. 3.44 in Aho.
//
//                        Most of the comments are quotes of Aho's psuedo-code.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::buildStateTable() {
    if (U_FAILURE(*fStatus)) {
        return;
    }
    //
    // Add a dummy state 0 - the stop state.  Not from Aho.
    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
    RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    failState->fPositions = new UVector(*fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }
    fDStates->addElement(failState, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // initially, the only unmarked state in Dstates is firstpos(root),
    //       where toot is the root of the syntax tree for (r)#;
    RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }
    initialState->fPositions = new UVector(*fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }
    setAdd(initialState->fPositions, fTree->fFirstPosSet);
    fDStates->addElement(initialState, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // while there is an unmarked state T in Dstates do begin
    for (;;) {
        RBBIStateDescriptor *T = NULL;
        int32_t              tx;
        for (tx=1; tx<fDStates->size(); tx++) {
            RBBIStateDescriptor *temp;
            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
            if (temp->fMarked == FALSE) {
                T = temp;
                break;
            }
        }
        if (T == NULL) {
            break;
        }

        // mark T;
        T->fMarked = TRUE;

        // for each input symbol a do begin
        int32_t  a;
        for (a = 1; a<=lastInputSymbol; a++) {
            // let U be the set of positions that are in followpos(p)
            //    for some position p in T
            //    such that the symbol at position p is a;
            UVector    *U = NULL;
            RBBINode   *p;
            int32_t     px;
            for (px=0; px<T->fPositions->size(); px++) {
                p = (RBBINode *)T->fPositions->elementAt(px);
                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
                    if (U == NULL) {
                        U = new UVector(*fStatus);
                    }
                    setAdd(U, p->fFollowPos);
                }
            }

            // if U is not empty and not in DStates then
            int32_t  ux = 0;
            UBool    UinDstates = FALSE;
            if (U != NULL) {
                U_ASSERT(U->size() > 0);
                int  ix;
                for (ix=0; ix<fDStates->size(); ix++) {
                    RBBIStateDescriptor *temp2;
                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
                    if (setEquals(U, temp2->fPositions)) {
                        delete U;
                        U  = temp2->fPositions;
                        ux = ix;
                        UinDstates = TRUE;
                        break;
                    }
                }

                // Add U as an unmarked state to Dstates
                if (!UinDstates)
                {
                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
                    if (U_FAILURE(*fStatus)) {
                        return;
                    }
                    newState->fPositions = U;
                    fDStates->addElement(newState, *fStatus);
                    if (U_FAILURE(*fStatus)) {
                        return;
                    }
                    ux = fDStates->size()-1;
                }

                // Dtran[T, a] := U;
                T->fDtran->setElementAt(ux, a);
            }
        }
    }
}
Exemplo n.º 18
0
//-----------------------------------------------------------------------------
//
//   calcChainedFollowPos.    Modify the previously calculated followPos sets
//                            to implement rule chaining.  NOT described by Aho
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {

    UVector         endMarkerNodes(*fStatus);
    UVector         leafNodes(*fStatus);
    int32_t         i;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    // get a list of all endmarker nodes.
    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);

    // get a list all leaf nodes
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Get all nodes that can be the start a match, which is FirstPosition()
    // of the portion of the tree corresponding to user-written rules.
    // See the tree description in bofFixup().
    RBBINode *userRuleRoot = tree;
    if (fRB->fSetBuilder->sawBOF()) {
        userRuleRoot = tree->fLeftChild->fRightChild;
    }
    U_ASSERT(userRuleRoot != NULL);
    UVector *matchStartNodes = userRuleRoot->fFirstPosSet;


    // Iteratate over all leaf nodes,
    //
    int32_t  endNodeIx;
    int32_t  startNodeIx;

    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
        RBBINode *endNode = NULL;

        // Identify leaf nodes that correspond to overall rule match positions.
        //   These include an endMarkerNode in their followPos sets.
        for (i=0; i<endMarkerNodes.size(); i++) {
            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
                endNode = tNode;
                break;
            }
        }
        if (endNode == NULL) {
            // node wasn't an end node.  Try again with the next.
            continue;
        }

        // We've got a node that can end a match.

        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
        //                            don't chain from it.
        // TODO:  Add rule syntax for this behavior, get specifics out of here and
        //        into the rule file.
        if (fRB->fLBCMNoChain) {
            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
            if (c != -1) {
                // c == -1 occurs with sets containing only the {eof} marker string.
                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
                if (cLBProp == U_LB_COMBINING_MARK) {
                    continue;
                }
            }
        }


        // Now iterate over the nodes that can start a match, looking for ones
        //   with the same char class as our ending node.
        RBBINode *startNode;
        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
            if (startNode->fType != RBBINode::leafChar) {
                continue;
            }

            if (endNode->fVal == startNode->fVal) {
                // The end val (character class) of one possible match is the
                //   same as the start of another.

                // Add all nodes from the followPos of the start node to the
                //  followPos set of the end node, which will have the effect of
                //  letting matches transition from a match state at endNode
                //  to the second char of a match starting with startNode.
                setAdd(endNode->fFollowPos, startNode->fFollowPos);
            }
        }
    }
}
Exemplo n.º 19
0
/**
 * Convert the elements of the 'list' vector, which are SingleID
 * objects, into actual Transliterator objects.  In the course of
 * this, some (or all) entries may be removed.  If all entries
 * are removed, the NULL transliterator will be added.
 *
 * Delete entries with empty basicIDs; these are generated by
 * elements like "(A)" in the forward direction, or "A()" in
 * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
 * SingleID entries to actual transliterators.
 *
 * @param list vector of SingleID objects.  On exit, vector
 * of one or more Transliterators.
 * @return new value of insertIndex.  The index will shift if
 * there are empty items, like "(Lower)", with indices less than
 * insertIndex.
 */
void TransliteratorIDParser::instantiateList(UVector & list,
        UErrorCode & ec)
{
	UVector tlist(ec);
	if (U_FAILURE(ec))
	{
		goto RETURN;
	}
	tlist.setDeleter(_deleteTransliteratorTrIDPars);

	Transliterator * t;
	int32_t i;
	for (i = 0; i <= list.size(); ++i) // [sic]: i<=list.size()
	{
		// We run the loop too long by one, so we can
		// do an insert after the last element
		if (i == list.size())
		{
			break;
		}

		SingleID * single = (SingleID *) list.elementAt(i);
		if (single->basicID.length() != 0)
		{
			t = single->createInstance();
			if (t == NULL)
			{
				ec = U_INVALID_ID;
				goto RETURN;
			}
			tlist.addElement(t, ec);
			if (U_FAILURE(ec))
			{
				delete t;
				goto RETURN;
			}
		}
	}

	// An empty list is equivalent to a NULL transliterator.
	if (tlist.size() == 0)
	{
		t = createBasicInstance(ANY_NULL, NULL);
		if (t == NULL)
		{
			// Should never happen
			ec = U_INTERNAL_TRANSLITERATOR_ERROR;
		}
		tlist.addElement(t, ec);
		if (U_FAILURE(ec))
		{
			delete t;
		}
	}

RETURN:

	UObjectDeleter * save = list.setDeleter(_deleteSingleID);
	list.removeAllElements();

	if (U_SUCCESS(ec))
	{
		list.setDeleter(_deleteTransliteratorTrIDPars);

		while (tlist.size() > 0)
		{
			t = (Transliterator *) tlist.orphanElementAt(0);
			list.addElement(t, ec);
			if (U_FAILURE(ec))
			{
				delete t;
				list.removeAllElements();
				break;
			}
		}
	}

	list.setDeleter(save);
}
Exemplo n.º 20
0
U_CDECL_END

/**
 * Parse a compound ID, consisting of an optional forward global
 * filter, a separator, one or more single IDs delimited by
 * separators, an an optional reverse global filter.  The
 * separator is a semicolon.  The global filters are UnicodeSet
 * patterns.  The reverse global filter must be enclosed in
 * parentheses.
 * @param id the pattern the parse
 * @param dir the direction.
 * @param canonID OUTPUT parameter that receives the canonical ID,
 * consisting of canonical IDs for all elements, as returned by
 * parseSingleID(), separated by semicolons.  Previous contents
 * are discarded.
 * @param list OUTPUT parameter that receives a list of SingleID
 * objects representing the parsed IDs.  Previous contents are
 * discarded.
 * @param globalFilter OUTPUT parameter that receives a pointer to
 * a newly created global filter for this ID in this direction, or
 * NULL if there is none.
 * @return TRUE if the parse succeeds, that is, if the entire
 * id is consumed without syntax error.
 */
UBool TransliteratorIDParser::parseCompoundID(const UnicodeString & id, int32_t dir,
        UnicodeString & canonID,
        UVector & list,
        UnicodeSet *& globalFilter)
{
	UErrorCode ec = U_ZERO_ERROR;
	int32_t i;
	int32_t pos = 0;
	int32_t withParens = 1;
	list.removeAllElements();
	UnicodeSet * filter;
	globalFilter = NULL;
	canonID.truncate(0);

	// Parse leading global filter, if any
	withParens = 0; // parens disallowed
	filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
	if (filter != NULL)
	{
		if (!ICU_Utility::parseChar(id, pos, ID_DELIM))
		{
			// Not a global filter; backup and resume
			canonID.truncate(0);
			pos = 0;
		}
		if (dir == FORWARD)
		{
			globalFilter = filter;
		}
		else
		{
			delete filter;
		}
		filter = NULL;
	}

	UBool sawDelimiter = TRUE;
	for (;;)
	{
		SingleID * single = parseSingleID(id, pos, dir, ec);
		if (single == NULL)
		{
			break;
		}
		if (dir == FORWARD)
		{
			list.addElement(single, ec);
		}
		else
		{
			list.insertElementAt(single, 0, ec);
		}
		if (U_FAILURE(ec))
		{
			goto FAIL;
		}
		if (!ICU_Utility::parseChar(id, pos, ID_DELIM))
		{
			sawDelimiter = FALSE;
			break;
		}
	}

	if (list.size() == 0)
	{
		goto FAIL;
	}

	// Construct canonical ID
	for (i = 0; i < list.size(); ++i)
	{
		SingleID * single = (SingleID *) list.elementAt(i);
		canonID.append(single->canonID);
		if (i != (list.size() - 1))
		{
			canonID.append(ID_DELIM);
		}
	}

	// Parse trailing global filter, if any, and only if we saw
	// a trailing delimiter after the IDs.
	if (sawDelimiter)
	{
		withParens = 1; // parens required
		filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
		if (filter != NULL)
		{
			// Don't require trailing ';', but parse it if present
			ICU_Utility::parseChar(id, pos, ID_DELIM);

			if (dir == REVERSE)
			{
				globalFilter = filter;
			}
			else
			{
				delete filter;
			}
			filter = NULL;
		}
	}

	// Trailing unparsed text is a syntax error
	ICU_Utility::skipWhitespace(id, pos, TRUE);
	if (pos != id.length())
	{
		goto FAIL;
	}

	return TRUE;

FAIL:
	UObjectDeleter * save = list.setDeleter(_deleteSingleID);
	list.removeAllElements();
	list.setDeleter(save);
	delete globalFilter;
	globalFilter = NULL;
	return FALSE;
}
Exemplo n.º 21
0
 virtual int32_t count(UErrorCode& status) const {
     return upToDate(status) ? _ids.size() : 0;
 }
Exemplo n.º 22
0
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
    const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
    if (U_FAILURE(errorCode)) { return; }

    const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
    const UnicodeString &overflowBoundary =
        *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);

    // We make a sorted array of elements.
    // Some of the input may be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
    // We filter out those cases.
    UnicodeSetIterator iter(*initialLabels_);
    while (iter.next()) {
        const UnicodeString *item = &iter.getString();
        LocalPointer<UnicodeString> ownedItem;
        UBool checkDistinct;
        int32_t itemLength = item->length();
        if (!item->hasMoreChar32Than(0, itemLength, 1)) {
            checkDistinct = FALSE;
        } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
                item->charAt(itemLength - 2) != 0x2a) {
            // Use a label if it is marked with one trailing star,
            // even if the label string sorts the same when all contractions are suppressed.
            ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
            item = ownedItem.getAlias();
            if (item == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            checkDistinct = FALSE;
        } else {
            checkDistinct = TRUE;
        }
        if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
            // Ignore a primary-ignorable or non-alphabetic index character.
        } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
            // Ignore an index character that will land in the overflow bucket.
        } else if (checkDistinct &&
                collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
            // Ignore a multi-code point index character that does not sort distinctly
            // from the sequence of its separate characters.
        } else {
            int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
            if (insertionPoint < 0) {
                indexCharacters.insertElementAt(
                    ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
            } else {
                const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
                if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
                    indexCharacters.setElementAt(
                        ownedString(*item, ownedItem, errorCode), insertionPoint);
                }
            }
        }
    }
    if (U_FAILURE(errorCode)) { return; }

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element

    int32_t size = indexCharacters.size() - 1;
    if (size > maxLabelCount_) {
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t i = 0; i < indexCharacters.size();) {
            ++count;
            int32_t bump = count * maxLabelCount_ / size;
            if (bump == old) {
                indexCharacters.removeElementAt(i);
            } else {
                old = bump;
                ++i;
            }
        }
    }
}
Exemplo n.º 23
0
//  Build the Whole Script Confusable data
//
//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                         because everything is local to this one build function anyhow,
//                           OR
//                         break this function into more reasonably sized pieces, with
//                         state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;
    
    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);
    

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);



    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
    
    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        UChar  line[200];
        uregex_group(parseRegexp, 0, line, 200, &status);
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->Union(targScript);
            bsset->sset->Union(srcScript);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    

    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);
        
        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.
    
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    

cleanup:
    if (U_FAILURE(status)) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    for (i=0; i<scriptSets->size(); i++) {
        BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
        delete bsset;
    }
    delete scriptSets;
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
Exemplo n.º 24
0
/*
 * Initializes the region data from the ICU resource bundles.  The region data
 * contains the basic relationships such as which regions are known, what the numeric
 * codes are, any known aliases, and the territory containment data.
 * 
 * If the region data has already loaded, then this method simply returns without doing
 * anything meaningful.
 */
void Region::loadRegionData() {

    if (regionDataIsLoaded) {
        return;
    }

    umtx_lock(&gRegionDataLock);

    if (regionDataIsLoaded) { // In case another thread gets to it before we do...
        umtx_unlock(&gRegionDataLock);
        return;
    }

   
    UErrorCode status = U_ZERO_ERROR;

    UResourceBundle* regionCodes = NULL;
    UResourceBundle* territoryAlias = NULL;
    UResourceBundle* codeMappings = NULL;
    UResourceBundle* worldContainment = NULL;
    UResourceBundle* territoryContainment = NULL;
    UResourceBundle* groupingContainment = NULL;

    DecimalFormat *df = new DecimalFormat(status);
    df->setParseIntegerOnly(TRUE);

    regionIDMap = uhash_open(uhash_hashUnicodeString,uhash_compareUnicodeString,NULL,&status);
    uhash_setValueDeleter(regionIDMap, deleteRegion);

    numericCodeMap = uhash_open(uhash_hashLong,uhash_compareLong,NULL,&status);

    regionAliases = uhash_open(uhash_hashUnicodeString,uhash_compareUnicodeString,NULL,&status);
    uhash_setKeyDeleter(regionAliases,uprv_deleteUObject);

    UResourceBundle *rb = ures_openDirect(NULL,"metadata",&status);
    regionCodes = ures_getByKey(rb,"regionCodes",NULL,&status);
    territoryAlias = ures_getByKey(rb,"territoryAlias",NULL,&status);
    
    UResourceBundle *rb2 = ures_openDirect(NULL,"supplementalData",&status);
    codeMappings = ures_getByKey(rb2,"codeMappings",NULL,&status);

    territoryContainment = ures_getByKey(rb2,"territoryContainment",NULL,&status);
    worldContainment = ures_getByKey(territoryContainment,"001",NULL,&status);
    groupingContainment = ures_getByKey(territoryContainment,"grouping",NULL,&status);

    UVector *continents = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);

    while ( ures_hasNext(worldContainment) ) {
        UnicodeString *continentName = new UnicodeString(ures_getNextUnicodeString(worldContainment,NULL,&status));
        continents->addElement(continentName,status);
    }

    UVector *groupings = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
    while ( ures_hasNext(groupingContainment) ) {
        UnicodeString *groupingName = new UnicodeString(ures_getNextUnicodeString(groupingContainment,NULL,&status));
        groupings->addElement(groupingName,status);
    }

    while ( ures_hasNext(regionCodes) ) {
        UnicodeString regionID = ures_getNextUnicodeString(regionCodes,NULL,&status);
        Region *r = new Region();
        r->idStr = regionID;
        r->idStr.extract(0,r->idStr.length(),r->id,sizeof(r->id),US_INV);
        r->type = URGN_TERRITORY; // Only temporary - figure out the real type later once the aliases are known.

        uhash_put(regionIDMap,(void *)&(r->idStr),(void *)r,&status);
        Formattable result;
        UErrorCode ps = U_ZERO_ERROR;
        df->parse(r->idStr,result,ps);
        if ( U_SUCCESS(ps) ) {
            r->code = result.getLong(); // Convert string to number
            uhash_iput(numericCodeMap,r->code,(void *)r,&status);
            r->type = URGN_SUBCONTINENT;
        } else {
            r->code = Region::UNDEFINED_NUMERIC_CODE;
        }
    }


    // Process the territory aliases
    while ( ures_hasNext(territoryAlias) ) {
        UResourceBundle *res = ures_getNextResource(territoryAlias,NULL,&status);
        const char *aliasFrom = ures_getKey(res);
        UnicodeString* aliasFromStr = new UnicodeString(aliasFrom);
        UnicodeString aliasTo = ures_getUnicodeString(res,&status);
        ures_close(res);

        Region *aliasToRegion = (Region *) uhash_get(regionIDMap,&aliasTo);
        Region *aliasFromRegion = (Region *)uhash_get(regionIDMap,aliasFromStr);

        if ( aliasToRegion != NULL && aliasFromRegion == NULL ) { // This is just an alias from some string to a region
            uhash_put(regionAliases,(void *)aliasFromStr, (void *)aliasToRegion,&status);
        } else {
            if ( aliasFromRegion == NULL ) { // Deprecated region code not in the master codes list - so need to create a deprecated region for it.
                aliasFromRegion = new Region();
                aliasFromRegion->idStr.setTo(*aliasFromStr);
                aliasFromRegion->idStr.extract(0,aliasFromRegion->idStr.length(),aliasFromRegion->id,sizeof(aliasFromRegion->id),US_INV);
                uhash_put(regionIDMap,(void *)&(aliasFromRegion->idStr),(void *)aliasFromRegion,&status);
                Formattable result;
                UErrorCode ps = U_ZERO_ERROR;
                df->parse(aliasFromRegion->idStr,result,ps);
                if ( U_SUCCESS(ps) ) {
                    aliasFromRegion->code = result.getLong(); // Convert string to number
                    uhash_iput(numericCodeMap,aliasFromRegion->code,(void *)aliasFromRegion,&status);
                } else {
                    aliasFromRegion->code = Region::UNDEFINED_NUMERIC_CODE;
                }
                aliasFromRegion->type = URGN_DEPRECATED;
            } else {
                aliasFromRegion->type = URGN_DEPRECATED;
            }
            delete aliasFromStr;

            aliasFromRegion->preferredValues = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
            UnicodeString currentRegion;
            currentRegion.remove();
            for (int32_t i = 0 ; i < aliasTo.length() ; i++ ) {
                if ( aliasTo.charAt(i) != 0x0020 ) {
                    currentRegion.append(aliasTo.charAt(i));
                }
                if ( aliasTo.charAt(i) == 0x0020 || i+1 == aliasTo.length() ) {
                    Region *target = (Region *)uhash_get(regionIDMap,(void *)&currentRegion);
                    if (target) {
                        UnicodeString *preferredValue = new UnicodeString(target->idStr);
                        aliasFromRegion->preferredValues->addElement((void *)preferredValue,status);
                    }
                    currentRegion.remove();
                }
            }
        }
    }

    // Process the code mappings - This will allow us to assign numeric codes to most of the territories.
    while ( ures_hasNext(codeMappings) ) {
        UResourceBundle *mapping = ures_getNextResource(codeMappings,NULL,&status);
        if ( ures_getType(mapping) == URES_ARRAY && ures_getSize(mapping) == 3) {
            UnicodeString codeMappingID = ures_getUnicodeStringByIndex(mapping,0,&status);
            UnicodeString codeMappingNumber = ures_getUnicodeStringByIndex(mapping,1,&status);
            UnicodeString codeMapping3Letter = ures_getUnicodeStringByIndex(mapping,2,&status);

            Region *r = (Region *)uhash_get(regionIDMap,(void *)&codeMappingID);
            if ( r ) {
                Formattable result;
                UErrorCode ps = U_ZERO_ERROR;
                df->parse(codeMappingNumber,result,ps);
                if ( U_SUCCESS(ps) ) {
                    r->code = result.getLong(); // Convert string to number
                    uhash_iput(numericCodeMap,r->code,(void *)r,&status);
                }
                UnicodeString *code3 = new UnicodeString(codeMapping3Letter);
                uhash_put(regionAliases,(void *)code3, (void *)r,&status);
            }                    
        }
        ures_close(mapping);
    }

    // Now fill in the special cases for WORLD, UNKNOWN, CONTINENTS, and GROUPINGS
    Region *r;
    r = (Region *) uhash_get(regionIDMap,(void *)&WORLD_ID);
    if ( r ) {
        r->type = URGN_WORLD;
    }

    r = (Region *) uhash_get(regionIDMap,(void *)&UNKNOWN_REGION_ID);
    if ( r ) {
        r->type = URGN_UNKNOWN;
    }

    for ( int32_t i = 0 ; i < continents->size() ; i++ ) {
        r = (Region *) uhash_get(regionIDMap,(void *)continents->elementAt(i));
        if ( r ) {
            r->type = URGN_CONTINENT;
        }
    }
    delete continents;

    for ( int32_t i = 0 ; i < groupings->size() ; i++ ) {
        r = (Region *) uhash_get(regionIDMap,(void *)groupings->elementAt(i));
        if ( r ) {
            r->type = URGN_GROUPING;
        }
    }
    delete groupings;

    // Special case: The region code "QO" (Outlying Oceania) is a subcontinent code added by CLDR
    // even though it looks like a territory code.  Need to handle it here.

    r = (Region *) uhash_get(regionIDMap,(void *)&OUTLYING_OCEANIA_REGION_ID);
    if ( r ) {
        r->type = URGN_SUBCONTINENT;
    }

    // Load territory containment info from the supplemental data.
    while ( ures_hasNext(territoryContainment) ) {
        UResourceBundle *mapping = ures_getNextResource(territoryContainment,NULL,&status);
        const char *parent = ures_getKey(mapping);
        UnicodeString parentStr = UnicodeString(parent);
        Region *parentRegion = (Region *) uhash_get(regionIDMap,(void *)&parentStr);

        for ( int j = 0 ; j < ures_getSize(mapping); j++ ) {
            UnicodeString child = ures_getUnicodeStringByIndex(mapping,j,&status);
            Region *childRegion = (Region *) uhash_get(regionIDMap,(void *)&child);
            if ( parentRegion != NULL && childRegion != NULL ) {                    

                // Add the child region to the set of regions contained by the parent
                if (parentRegion->containedRegions == NULL) {
                    parentRegion->containedRegions = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
                }

                UnicodeString *childStr = new UnicodeString(status);
                childStr->fastCopyFrom(childRegion->idStr);
                parentRegion->containedRegions->addElement((void *)childStr,status);

                // Set the parent region to be the containing region of the child.
                // Regions of type GROUPING can't be set as the parent, since another region
                // such as a SUBCONTINENT, CONTINENT, or WORLD must always be the parent.
                if ( parentRegion->type != URGN_GROUPING) {
                    childRegion->containingRegion = parentRegion;
                }
            }
        }
        ures_close(mapping);
    }     

    // Create the availableRegions lists
    int32_t pos = -1;
    while ( const UHashElement* element = uhash_nextElement(regionIDMap,&pos)) {
        Region *ar = (Region *)element->value.pointer;
        if ( availableRegions[ar->type] == NULL ) {
            availableRegions[ar->type] = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
        }
        UnicodeString *arString = new UnicodeString(ar->idStr);
        availableRegions[ar->type]->addElement((void *)arString,status);
    }

    ures_close(territoryContainment);
    ures_close(worldContainment);
    ures_close(groupingContainment);

    ures_close(codeMappings);
    ures_close(rb2);
    ures_close(territoryAlias);
    ures_close(regionCodes);
    ures_close(rb);

    delete df;

    ucln_i18n_registerCleanup(UCLN_I18N_REGION, region_cleanup);

    regionDataIsLoaded = true;
    umtx_unlock(&gRegionDataLock);

}
Exemplo n.º 25
0
 virtual const UnicodeString* snext(UErrorCode& status) {
     if (upToDate(status) && (_pos < _ids.size())) {
         return (const UnicodeString*)_ids[_pos++];
     }
     return NULL;
 }
Exemplo n.º 26
0
static unsigned cookie_split(UVector<UString>& vec, const UString& buffer, const char* delim)
{
   U_TRACE(5, "cookie_split(%p,%.*S,%S)", &vec, U_STRING_TO_TRACE(buffer), delim)

   UString x;

   unsigned r, n   = vec.size();
   const char* s   = buffer.data();
   const char* ss  = s;
   const char* end = s + buffer.size();
   const char* p;
   const char* b = s;

loop:
   if (s >= end) goto done;

   if (strchr(delim, *s))
      {
      ++s; 

      goto loop;
      }
   else
      {
      while (isspace(*s)) s++;

      p = s++;

      if (*(s-1) == '"')
         {
         while (s < end && *s != '"') ++s;
         }

      ss = s;

      while (s < end && strchr(delim,*s) == 0)
         {
         ++s;

         if (!isspace(*(s-1))) ss = s;
         }
      }

   if (*p == '"' && *(ss-1) == '"')
      {
      p++;
      ss--;
      }

   x = buffer.substr(p - b, ss - p);

   vec.push_back(x);

   ++s;

   goto loop;

done:
   r = vec.size() - n;

#ifdef DEBUG
   for (unsigned i = 0; i < r; ++i)
      {
      U_DUMP("vec[%d] = %.*S", n+i, U_STRING_TO_TRACE(vec[n+i]))
      }
#endif

   U_RETURN(r);
}