void IntlTestSpoof::testSpoofAPI() { TEST_SETUP UnicodeString s("xyz"); // Many latin ranges are whole-script confusable with other scripts. // If this test starts failing, consult confusablesWholeScript.txt int32_t position = 666; int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_ASSERT_EQ(666, position); TEST_TEARDOWN; TEST_SETUP UnicodeString s1("cxs"); UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; TEST_SETUP UnicodeString s("I1l0O"); UnicodeString dest; UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(UnicodeString("lllOO") == dest); TEST_ASSERT(&dest == &retStr); TEST_TEARDOWN; }
/**
 * UTF-8 flavor of the skeleton computation.
 *
 * Converts the UTF-8 input to a UnicodeString, delegates the actual work to
 * uspoof_getSkeletonUnicodeString(), and converts the result back to UTF-8
 * with u_strToUTF8().
 *
 * @param sc            The spoof checker; validated with SpoofImpl::validateThis().
 * @param type          Passed through unchanged to uspoof_getSkeletonUnicodeString().
 * @param id            The UTF-8 input.
 * @param length        Length of id in bytes, or -1, in which case the length is
 *                      measured with uprv_strlen().
 * @param dest          Output buffer for the UTF-8 skeleton.
 * @param destCapacity  Capacity of dest.
 * @param status        In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                      length < -1, destCapacity < 0, or destCapacity == 0 with a
 *                      non-NULL dest.
 * @return              The UTF-8 length reported by u_strToUTF8(), or 0 when the
 *                      function exits early because of an error.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
                       uint32_t type,
                       const char *id, int32_t length,
                       char *dest, int32_t destCapacity,
                       UErrorCode *status) {
    SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }

    // Argument validation, one condition at a time.
    UBool argsAreBad = FALSE;
    if (length < -1) {
        argsAreBad = TRUE;
    } else if (destCapacity < 0) {
        argsAreBad = TRUE;
    } else if (destCapacity == 0 && dest != NULL) {
        argsAreBad = TRUE;
    }
    if (argsAreBad) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // A negative length (only -1 survives the validation above) means the
    // byte count has to be measured here.
    int32_t byteCount = length;
    if (byteCount < 0) {
        byteCount = static_cast<int32_t>(uprv_strlen(id));
    }

    UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, byteCount));
    UnicodeString destStr;
    uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
    if (U_FAILURE(*status)) {
        return 0;
    }

    int32_t lengthInUTF8 = 0;
    u_strToUTF8(dest, destCapacity, &lengthInUTF8,
                destStr.getBuffer(), destStr.length(), status);
    return lengthInUTF8;
}
void IntlTestSpoof::testSpoofAPI() { TEST_SETUP UnicodeString s("uvw"); int32_t position = 666; int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT_EQ(0, checkResults); TEST_ASSERT_EQ(666, position); TEST_TEARDOWN; TEST_SETUP UnicodeString s1("cxs"); UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); // Cyrillic "cxs" int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status); TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults); TEST_TEARDOWN; TEST_SETUP UnicodeString s("I1l0O"); UnicodeString dest; UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(UnicodeString("11100") == dest); TEST_ASSERT(&dest == &retStr); TEST_TEARDOWN; }
// // Run a single confusable skeleton transformation test case. // void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, const char *input, const char *expected, int32_t lineNum) { UnicodeString uInput = UnicodeString(input).unescape(); UnicodeString uExpected = UnicodeString(expected).unescape(); UErrorCode status = U_ZERO_ERROR; UnicodeString actual; uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); if (U_FAILURE(status)) { errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum, u_errorName(status)); return; } if (uExpected != actual) { errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.", __FILE__, __LINE__, lineNum); errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("\"\n") + UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\"")); } }
/**
 * UTF-16 (UChar buffer) flavor of the skeleton computation.
 *
 * Wraps the input in a UnicodeString, delegates the actual work to
 * uspoof_getSkeletonUnicodeString(), and extracts the result into the
 * caller's buffer.
 *
 * @param sc            The spoof checker; validated with SpoofImpl::validateThis().
 * @param type          Passed through unchanged to uspoof_getSkeletonUnicodeString().
 * @param id            The UTF-16 input.
 * @param length        Length of id, or -1 (the input is then handed to the
 *                      UnicodeString constructor as terminated).
 * @param dest          Output buffer for the skeleton.
 * @param destCapacity  Capacity of dest.
 * @param status        In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when
 *                      length < -1, destCapacity < 0, or destCapacity == 0 with a
 *                      non-NULL dest.
 * @return              The length of the computed skeleton (independent of how
 *                      much of it fit into dest), or 0 when an error occurred
 *                      before the extraction step.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *id, int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {
    SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UnicodeString idStr((length==-1), id, length);  // Aliasing constructor
    UnicodeString destStr;
    uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
    if (U_FAILURE(*status)) {
        // The skeleton could not be computed: do not report a length for it.
        // This matches what uspoof_getSkeletonUTF8() does in the same situation.
        return 0;
    }

    destStr.extract(dest, destCapacity, *status);
    return destStr.length();
}
// testConfData - Check each data item from the Unicode confusables.txt file, // verify that it transforms correctly in a skeleton. // void IntlTestSpoof::testConfData() { UErrorCode status = U_ZERO_ERROR; const char *testDataDir = IntlTest::getSourceTestData(status); TEST_ASSERT_SUCCESS(status); char buffer[2000]; uprv_strcpy(buffer, testDataDir); uprv_strcat(buffer, "confusables.txt"); LocalStdioFilePointer f(fopen(buffer, "rb")); if (f.isNull()) { errln("Skipping test spoof/testConfData. File confusables.txt not accessible."); return; } fseek(f.getAlias(), 0, SEEK_END); int32_t fileSize = ftell(f.getAlias()); LocalArray<char> fileBuf(new char[fileSize]); fseek(f.getAlias(), 0, SEEK_SET); int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); TEST_ASSERT_EQ(amt_read, fileSize); TEST_ASSERT(fileSize>0); if (amt_read != fileSize || fileSize <=0) { return; } UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.getAlias(), fileSize)); LocalUSpoofCheckerPointer sc(uspoof_open(&status)); TEST_ASSERT_SUCCESS(status); // Parse lines from the confusables.txt file. Example Line: // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... // Three fields. The hex fields can contain more than one character, // and each character may be more than 4 digits (for supplemntals) // This regular expression matches lines and splits the fields into capture groups. RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status); TEST_ASSERT_SUCCESS(status); while (parseLine.find()) { UnicodeString from = parseHex(parseLine.group(1, status)); if (!Normalizer::isNormalized(from, UNORM_NFD, status)) { // The source character was not NFD. // Skip this case; the first step in obtaining a skeleton is to NFD the input, // so the mapping in this line of confusables.txt will never be applied. 
continue; } UnicodeString rawExpected = parseHex(parseLine.group(2, status)); UnicodeString expected; Normalizer::decompose(rawExpected, FALSE /*NFD*/, 0, expected, status); TEST_ASSERT_SUCCESS(status); int32_t skeletonType = 0; UnicodeString tableType = parseLine.group(3, status); TEST_ASSERT_SUCCESS(status); if (tableType.indexOf("SL") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; } else if (tableType.indexOf("SA") >= 0) { skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; } else if (tableType.indexOf("ML") >= 0) { skeletonType = 0; } else if (tableType.indexOf("MA") >= 0) { skeletonType = USPOOF_ANY_CASE; } UnicodeString actual; uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actual, &status); TEST_ASSERT_SUCCESS(status); TEST_ASSERT(actual == expected); if (actual != expected) { errln(parseLine.group(0, status)); UnicodeString line = "Actual: "; int i = 0; while (i < actual.length()) { appendHexUChar(line, actual.char32At(i)); i = actual.moveIndex32(i, 1); } errln(line); } if (U_FAILURE(status)) { break; } } }
/**
 * Check whether two identifiers are visually confusable.
 *
 * Only the USPOOF_SINGLE_SCRIPT_CONFUSABLE, USPOOF_MIXED_SCRIPT_CONFUSABLE and
 * USPOOF_WHOLE_SCRIPT_CONFUSABLE bits of the checker's enabled checks are
 * consulted (plus USPOOF_ANY_CASE, which is forwarded to the skeleton
 * computation).
 *
 * @param sc      The spoof checker; validated with SpoofImpl::validateThis().
 * @param id1     The first identifier.
 * @param id2     The second identifier.
 * @param status  In/out error code. Set to U_INVALID_STATE_ERROR when none of
 *                the three confusable checks is enabled in the checker.
 * @return        A combination of the three confusable flags describing how the
 *                identifiers are confusable, or 0 if they are not confusable or
 *                an error occurred.
 *                Note: when the mixed-script skeletons match,
 *                USPOOF_MIXED_SCRIPT_CONFUSABLE is reported even if only the
 *                whole-script check was enabled.
 */
U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
                                  const icu::UnicodeString &id1,
                                  const icu::UnicodeString &id2,
                                  UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    //
    // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
    //   and for definitions of the types (single, whole, mixed-script) of confusables.

    // We only care about a few of the check flags.  Ignore the others.
    // If no tests relevant to this function have been specified, return an error.
    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
    //        but logically we would just return 0 (no error).
    if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
                          USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
        *status = U_INVALID_STATE_ERROR;
        return 0;
    }
    int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;

    int32_t  result = 0;

    // Collect the script count and first script of each identifier. A single
    // IdentifierInfo is borrowed and reused for both identifiers.
    IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    identifierInfo->setIdentifier(id1, *status);
    int32_t id1ScriptCount = identifierInfo->getScriptCount();
    int32_t id1FirstScript = identifierInfo->getScripts()->nextSetBit(0);
    identifierInfo->setIdentifier(id2, *status);
    int32_t id2ScriptCount = identifierInfo->getScriptCount();
    int32_t id2FirstScript = identifierInfo->getScripts()->nextSetBit(0);
    This->releaseIdentifierInfo(identifierInfo);
    identifierInfo = NULL;
    if (U_FAILURE(*status)) {
        // The script information gathered above cannot be trusted; checked only
        // after the IdentifierInfo has been handed back.
        return 0;
    }

    //
    // Single Script Check
    //
    if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
        UnicodeString   id1Skeleton;
        UnicodeString   id2Skeleton;
        if (id1ScriptCount <= 1 && id2ScriptCount <= 1 && id1FirstScript == id2FirstScript) {
            flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
            if (U_FAILURE(*status)) {
                // Do not compare skeletons that could not be computed.
                return 0;
            }
            if (id1Skeleton == id2Skeleton) {
                result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
            }
        }
    }

    if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
        // If the two inputs are single script confusable they cannot also be
        // mixed or whole script confusable, according to the UAX39 definitions.
        // So we can skip those tests.
        return result;
    }

    // Two identifiers are whole script confusable if each is of a single script
    // and they are mixed script confusable.
    UBool possiblyWholeScriptConfusables =
        id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);

    //
    // Mixed Script Check
    //
    if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
        // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
        // the mixed script table skeleton, which is what we want.
        // The Any Case / Lower Case bit in the skeleton flags was set at the top of the function.
        UnicodeString id1Skeleton;
        UnicodeString id2Skeleton;
        flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
        if (U_FAILURE(*status)) {
            // Do not compare skeletons that could not be computed.
            return 0;
        }
        if (id1Skeleton == id2Skeleton) {
            result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            if (possiblyWholeScriptConfusables) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
        }
    }

    return result;
}
/**
 * Check whether two identifiers are visually confusable.
 *
 * The identifiers are confusable when their skeletons are equal. The kind of
 * confusability is then derived from the resolved script sets of the two
 * identifiers and filtered by the checks enabled in the spoof checker.
 *
 * @param sc      The spoof checker; validated with SpoofImpl::validateThis().
 * @param id1     The first identifier.
 * @param id2     The second identifier.
 * @param status  In/out error code. Set to U_INVALID_STATE_ERROR when no
 *                USPOOF_CONFUSABLE check is enabled in the checker.
 * @return        A combination of USPOOF_SINGLE_SCRIPT_CONFUSABLE,
 *                USPOOF_MIXED_SCRIPT_CONFUSABLE and USPOOF_WHOLE_SCRIPT_CONFUSABLE,
 *                or 0 if the skeletons differ or an error occurred.
 */
U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
                                  const icu::UnicodeString &id1,
                                  const icu::UnicodeString &id2,
                                  UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }

    // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
    //   and for definitions of the types (single, whole, mixed-script) of confusables.
    //
    // We only care about a few of the check flags.  Ignore the others.
    // If no tests relevant to this function have been specified, return an error.
    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
    //        but logically we would just return 0 (no error).
    if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
        *status = U_INVALID_STATE_ERROR;
        return 0;
    }

    // Step 1: the strings are confusable only if both skeletons could be
    // computed and are identical.
    UnicodeString skeleton1;
    UnicodeString skeleton2;
    uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id1, skeleton1, status);
    uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id2, skeleton2, status);
    if (U_FAILURE(*status) || skeleton1 != skeleton2) {
        return 0;
    }

    // Step 2: the strings are confusable. Classify the kind of confusability
    // according to UTS 39 section 4, starting from the resolved script sets.
    ScriptSet scripts1;
    ScriptSet scripts2;
    This->getResolvedScriptSet(id1, scripts1, *status);
    This->getResolvedScriptSet(id2, scripts2, *status);

    int32_t applicable;
    if (scripts1.intersects(scripts2)) {
        // The identifiers share at least one script.
        applicable = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    } else if (scripts1.isEmpty() || scripts2.isEmpty()) {
        // No common script, and at least one resolved script set is empty.
        applicable = USPOOF_MIXED_SCRIPT_CONFUSABLE;
    } else {
        // No common script, and both resolved script sets are non-empty.
        applicable = USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE;
    }

    // Step 3: report only the kinds the user asked for. `applicable` holds
    // nothing but the three confusable bits, so masking with the enabled
    // checks drops exactly the ones that were not requested.
    return applicable & This->fChecks;
}