/*
 * Record dest as a case closure mapping of src.
 *
 * If src's trie value does not yet carry the UCASE_EXCEPTION flag, an
 * exception entry is created for it via makeExcProps() and the value returned
 * by makeExcProps() is written back into the trie. dest is then added to the
 * closure set of src's exception entry.
 *
 * Does nothing if errorCode already indicates a failure on entry.
 */
void
CasePropsBuilder::addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }

    if(beVerbose) {
        printf("add closure mapping U+%04lx->U+%04lx\n",
                (unsigned long)src, (unsigned long)dest);
    }

    uint32_t props=utrie2_get32(pTrie, src);
    const bool needsException=(props&UCASE_EXCEPTION)==0;
    if(needsException) {
        /* src has no exception entry yet: create one and store its value for src */
        props=makeExcProps(src, props, errorCode);
        utrie2_set32(pTrie, src, props, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genprops error: unable to set case mapping values, code: %s\n",
                            u_errorName(errorCode));
            return;
        }
    }

    /* the bits above UGENCASE_EXC_SHIFT index into excProps[] */
    excProps[props>>UGENCASE_EXC_SHIFT]->closure.add(dest);
}
/*
 * Look up the Norm unit already stored for c.
 * A trie value of 0 means that no Norm unit exists for c; NULL is returned then.
 * Any other trie value is used as an index into norms[].
 */
Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
    const uint32_t normIndex=utrie2_get32(normTrie, c);
    if(normIndex!=0) {
        return norms+normIndex;
    }
    return NULL;
}
//--------------------------------------------------------------------------------------- // // wholeScriptCheck() // // Input text is already normalized to NFD // Return the set of scripts, each of which can represent something that is // confusable with the input text. The script of the input text // is included; input consisting of characters from a single script will // always produce a result consisting of a set containing that script. // //--------------------------------------------------------------------------------------- void SpoofImpl::wholeScriptCheck( const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const { int32_t inputIdx = 0; UChar32 c; UTrie2 *table = (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; result->setAll(); while (inputIdx < length) { U16_NEXT(text, inputIdx, length, c); uint32_t index = utrie2_get32(table, c); if (index == 0) { // No confusables in another script for this char. // TODO: we should change the data to have sets with just the single script // bit for the script of this char. Gets rid of this special case. // Until then, grab the script from the char and intersect it with the set. UScriptCode cpScript = uscript_getScript(c, &status); U_ASSERT(cpScript > USCRIPT_INHERITED); result->intersect(cpScript); } else if (index == 1) { // Script == Common or Inherited. Nothing to do. } else { result->intersect(fSpoofData->fScriptSets[index]); } } }
/*
 * Return the Norm unit for c, creating it first if none exists.
 *
 * A non-zero trie value for c is the index of an existing Norm unit in norms[].
 * Otherwise a new unit is obtained from allocNorm() and its index is stored
 * in the trie for c (which creates intermediate trie entries as needed).
 */
Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
    const uint32_t existingIndex=utrie2_get32(normTrie, c);
    if(existingIndex!=0) {
        return norms+existingIndex;
    }

    /* no unit yet: allocate one and register its index for c */
    Norm *fresh=allocNorm();
    IcuToolErrorCode errorCode("gennorm2/createNorm()");
    const uint32_t freshIndex=(uint32_t)(fresh-norms);
    utrie2_set32(normTrie, c, freshIndex, errorCode);
    return fresh;
}
/* * Find missing case mapping relationships and add mappings for case closure. * This function starts from an "original" code point and recursively * finds its case mappings and the case mappings of where it maps to. * * The recursion depth is capped at 3 nested calls of this function. * In each call, the current code point is c, and the function enumerates * all of c's simple (single-code point) case mappings. * prev is the code point that case-mapped to c. * prev2 is the code point that case-mapped to prev. * * The initial function call has prev2<0, prev<0, and c==orig * (marking no code points). * It enumerates c's case mappings and recurses without further action. * * The second-level function call has prev2<0, prev==orig, and c is * the destination code point of one of prev's case mappings. * The function checks if any of c's case mappings go back to orig * and adds a closure mapping if not. * In other words, it turns a case mapping relationship of * orig->c * into * orig<->c * * The third-level function call has prev2==orig, prev>=0, and c is * the destination code point of one of prev's case mappings. * (And prev is the destination of one of prev2's case mappings.) * The function checks if any of c's case mappings go back to orig * and adds a closure mapping if not. * In other words, it turns case mapping relationships of * orig->prev->c or orig->prev<->c * into * orig->prev->c->orig or orig->prev<->c->orig * etc. * (Graphically, this closes a triangle.) * * With repeated application on all code points until no more closure mappings * are added, all case equivalence groups get complete mappings. * That is, in each group of code points with case relationships * each code point will in the end have some mapping to each other * code point in the group. 
* * @return TRUE if a closure mapping was added */ UBool CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return FALSE; } UChar32 next; UBool someMappingsAdded=FALSE; if(c!=orig) { /* get the properties for c */ value=utrie2_get32(pTrie, c); } /* else if c==orig then c's value was passed in */ if(value&UCASE_EXCEPTION) { UnicodeSet set; ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT]; UniProps &p=ep.props; /* * marker for whether any of c's mappings goes to orig * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings */ UBool mapsToOrig=(UBool)(c==orig); /* collect c's case mapping destinations in set[] */ if((next=p.suc)>=0 && next!=c) { set.add(next); } if((next=p.slc)>=0 && next!=c) { set.add(next); } if(p.suc!=(next=p.stc) && next!=c) { set.add(next); } if((next=p.scf)>=0 && next!=c) { set.add(next); } /* add c's current closure mappings to set */ set.addAll(ep.closure); /* process all code points to which c case-maps */ UnicodeSetIterator iter(set); while(iter.next()) { next=iter.getCodepoint(); /* next!=c */ if(next==orig) { mapsToOrig=TRUE; /* remember that we map to orig */ } else if(prev2<0 && next!=prev) { /* * recurse unless * we have reached maximum depth (prev2>=0) or * this is a mapping to one of the previous code points (orig, prev, c) */ someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode); } } if(!mapsToOrig) { addClosureMapping(c, orig, errorCode); return TRUE; } } else {
/*
 * Return a reference to the Norm unit for c.
 * The trie value for c is used directly as the index into norms[]; unlike
 * getNorm(), a trie value of 0 is not special-cased and yields norms[0].
 */
const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
    const uint32_t normIndex=utrie2_get32(normTrie, c);
    return *(norms+normIndex);
}
// Build the Whole Script Confusable data // // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, // because everything is local to this one build function anyhow, // OR // break this function into more reasonably sized pieces, with // state in WSConfusableDataBuilder. // void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) { if (U_FAILURE(status)) { return; } URegularExpression *parseRegexp = NULL; int32_t inputLen = 0; UChar *input = NULL; int32_t lineNum = 0; UVector *scriptSets = NULL; uint32_t rtScriptSetsCount = 2; UTrie2 *anyCaseTrie = NULL; UTrie2 *lowerCaseTrie = NULL; anyCaseTrie = utrie2_open(0, 0, &status); lowerCaseTrie = utrie2_open(0, 0, &status); // The scriptSets vector provides a mapping from TRIE values to the set of scripts. // // Reserved TRIE values: // 0: Code point has no whole script confusables. // 1: Code point is of script Common or Inherited. // These code points do not participate in whole script confusable detection. // (This is logically equivalent to saying that they contain confusables in // all scripts) // // Because Trie values are indexes into the ScriptSets vector, pre-fill // vector positions 0 and 1 to avoid conflicts with the reserved values. 
scriptSets = new UVector(status); if (scriptSets == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement((void *)NULL, status); scriptSets->addElement((void *)NULL, status); // Convert the user input data from UTF-8 to UChar (UTF-16) u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); if (input == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); parseRegexp = uregex_openC(parseExp, 0, NULL, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*input == 0xfeff) { *input = 0x20; } // Parse the input, one line per iteration of this loop. uregex_setText(parseRegexp, input, inputLen, &status); while (uregex_findNext(parseRegexp, &status)) { lineNum++; UChar line[200]; uregex_group(parseRegexp, 0, line, 200, &status); if (uregex_start(parseRegexp, 1, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(parseRegexp, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; goto cleanup; } if (U_FAILURE(status)) { goto cleanup; } // Pick up the start and optional range end code points from the parsed line. UChar32 startCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); UChar32 endCodePoint = startCodePoint; if (uregex_start(parseRegexp, 3, &status) >=0) { endCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); } // Extract the two script names from the source line. We need these in an 8 bit // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on // to the ICU u_getPropertyValueEnum() function. 
Ugh. char srcScriptName[20]; char targScriptName[20]; extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); UScriptCode srcScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); UScriptCode targScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); if (U_FAILURE(status)) { goto cleanup; } if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } // select the table - (A) any case or (L) lower case only UTrie2 *table = anyCaseTrie; if (uregex_start(parseRegexp, 7, &status) >= 0) { table = lowerCaseTrie; } // Build the set of scripts containing confusable characters for // the code point(s) specified in this input line. // Sanity check that the script of the source code point is the same // as the source script indicated in the input file. Failure of this check is // an error in the input file. // Include the source script in the set (needed for Mixed Script Confusable detection). 
// UChar32 cp; for (cp=startCodePoint; cp<=endCodePoint; cp++) { int32_t setIndex = utrie2_get32(table, cp); BuilderScriptSet *bsset = NULL; if (setIndex > 0) { U_ASSERT(setIndex < scriptSets->size()); bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); } else { bsset = new BuilderScriptSet(); if (bsset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } bsset->codePoint = cp; bsset->trie = table; bsset->sset = new ScriptSet(); setIndex = scriptSets->size(); bsset->index = setIndex; bsset->rindex = 0; if (bsset->sset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement(bsset, status); utrie2_set32(table, cp, setIndex, &status); } bsset->sset->Union(targScript); bsset->sset->Union(srcScript); if (U_FAILURE(status)) { goto cleanup; } UScriptCode cpScript = uscript_getScript(cp, &status); if (cpScript != srcScript) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } } } // Eliminate duplicate script sets. At this point we have a separate // script set for every code point that had data in the input file. // // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them // // printf("Number of scriptSets: %d\n", scriptSets->size()); { int32_t duplicateCount = 0; rtScriptSetsCount = 2; for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); if (outerSet->index != static_cast<uint32_t>(outeri)) { // This set was already identified as a duplicate. // It will not be allocated a position in the runtime array of ScriptSets. 
continue; } outerSet->rindex = rtScriptSetsCount++; for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { delete innerSet->sset; innerSet->scriptSetOwned = FALSE; innerSet->sset = outerSet->sset; innerSet->index = outeri; innerSet->rindex = outerSet->rindex; duplicateCount++; } // But this doesn't get all. We need to fix the TRIE. } } // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); } // Update the Trie values to be reflect the run time script indexes (after duplicate merging). // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets // are unused, which is why the loop index starts at 2.) { for (int32_t i=2; i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex != (uint32_t)i) { utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); } } } // For code points with script==Common or script==Inherited, // Set the reserved value of 1 into both Tries. These characters do not participate // in Whole Script Confusable detection; this reserved value is the means // by which they are detected. 
{ UnicodeSet ignoreSet; ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); UnicodeSet inheritedSet; inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); ignoreSet.addAll(inheritedSet); for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { UChar32 rangeStart = ignoreSet.getRangeStart(rn); UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); } } // Serialize the data to the Spoof Detector { utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); // printf("Any case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; void *where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(anyCaseTrie, where, size, &status); utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); // printf("Lower case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(lowerCaseTrie, where, size, &status); spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; ScriptSet *rtScriptSets = static_cast<ScriptSet *> (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); uint32_t rindex = 2; for (int32_t i=2; 
i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex < rindex) { // We have already copied this script set to the serialized data. continue; } U_ASSERT(rindex == bSet->rindex); rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. rindex++; } } // Open new utrie2s from the serialized data. We don't want to keep the ones // we just built because we would then have two copies of the data, one internal to // the utries that we have already constructed, and one in the serialized data area. // An alternative would be to not pre-serialize the Trie data, but that makes the // spoof detector data different, depending on how the detector was constructed. // It's simpler to keep the data always the same. spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); cleanup: if (U_FAILURE(status)) { pe->line = lineNum; } uregex_close(parseRegexp); uprv_free(input); int32_t i; for (i=0; i<scriptSets->size(); i++) { BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); delete bsset; } delete scriptSets; utrie2_close(anyCaseTrie); utrie2_close(lowerCaseTrie); return; }