// A port of hb_icu_script_to_script because harfbuzz on CrOS is built // without hb-icu. See http://crbug.com/356929 static inline hb_script_t ICUScriptToHBScript(UScriptCode script) { if (UNLIKELY(script == USCRIPT_INVALID_CODE)) return HB_SCRIPT_INVALID; return hb_script_from_string(uscript_getShortName(script), -1); }
// Fills the two name tables:
//   script_[i]           - short name of script code i
//   general_category_[i] - short property-value name of general category i
Unicode()
    : script_(USCRIPT_CODE_LIMIT),
      general_category_(U_CHAR_CATEGORY_COUNT) {
  int code = 0;
  while (code < USCRIPT_CODE_LIMIT) {
    script_[code] = uscript_getShortName(static_cast<UScriptCode>(code));
    ++code;
  }

  int category = 0;
  while (category < U_CHAR_CATEGORY_COUNT) {
    general_category_[category] = u_getPropertyValueName(
        UCHAR_GENERAL_CATEGORY, category, U_SHORT_PROPERTY_NAME);
    ++category;
  }
}
/**
 * Loads the break dictionary configured for `script` and wraps it in a
 * DictionaryMatcher.
 *
 * The dictionary file name is read from the "dictionaries" table of the root
 * brkitr resource bundle, keyed by the script's short name. The name is split
 * at its last dot into base name and extension, and the data file is opened
 * with udata_open().
 *
 * @param script  script whose dictionary is wanted
 * @return a new BytesDictionaryMatcher or UCharsDictionaryMatcher, or NULL
 *         when there is no entry for the script, the data file cannot be
 *         opened, the trie type is not recognised, or allocation fails.
 */
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);
    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return NULL;
    }

    // Split "<base>.<ext>" at the last dot; without a dot the extension stays empty.
    CharString baseName;
    CharString extension;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);
    if (lastDot != NULL) {
        const int32_t baseLength = (int32_t)(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(FALSE, lastDot + 1, nameLength - baseLength - 1), status);
        nameLength = baseLength;
    }
    baseName.appendInvariantChars(UnicodeString(FALSE, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
        return NULL;
    }

    // build trie
    const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    const int32_t *indexes = (const int32_t *)data;
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType =
        indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = NULL;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = (const char *)(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const UChar *characters = (const UChar *)(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }

    if (matcher != NULL) {
        return matcher;
    }

    // no matcher exists to take ownership - either we are an invalid
    // type or memory allocation failed
    udata_close(file);
    return NULL;
}
/**
 * Appends the short names of all scripts in this set to `dest`, separated by
 * single spaces (U+0020), in the order nextSetBit() reports them.
 *
 * @param dest  string to append to
 * @return `dest`
 */
UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
    UBool needSeparator = FALSE;
    int32_t bit = nextSetBit(0);
    while (bit >= 0) {
        if (needSeparator) {
            dest.append((UChar)0x20);
        }
        needSeparator = TRUE;
        const char *shortName = uscript_getShortName((UScriptCode(bit)));
        dest.append(UnicodeString(shortName, -1, US_INV));
        bit = nextSetBit(bit + 1);
    }
    return dest;
}
/**
 * Returns the transliterator to use for text in script `source`, creating it
 * on first use and storing it in `cache` keyed by the script code.
 *
 * Creation first tries the ID "<source short name><TARGET_SEP><target>". If
 * that fails it tries the Latin pivot "<source short name><LATIN_PIVOT><target>".
 *
 * @param source  script of the text to transliterate
 * @return the cached or newly created transliterator, or NULL when `source`
 *         is the target script, is USCRIPT_INVALID_CODE, or when neither ID
 *         yields a transliterator.
 */
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    Transliterator* t = NULL;
    {
        // The cache is only read while the Mutex object is alive.
        Mutex m(NULL);
        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    }
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);

        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;

            // Clear any error left by the direct attempt. ICU functions that
            // take a UErrorCode return immediately when it already signals a
            // failure, so without this reset the pivot attempt below could
            // never succeed after a failed direct lookup.
            ec = U_ZERO_ERROR;

            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT, -1).append(target);
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            Transliterator *rt = NULL;
            {
                Mutex m(NULL);
                rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
                if (rt == NULL) {
                    // Common case, no race to cache this new transliterator.
                    uhash_iput(cache, (int32_t) source, t, &ec);
                } else {
                    // Race case, some other thread beat us to caching this transliterator.
                    Transliterator *temp = rt;
                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
                    t  = temp; // The transliterator from the cache that we will return.
                }
            }
            // Deleted after the Mutex scope has ended.
            delete rt;  // will be non-null only in case of races.
        }
    }
    return t;
}
/*
 * Writes a space-separated list of the short names of scripts[0..length-1]
 * into s. When length is 0 the text "(no scripts)" is written instead.
 *
 * No bounds checking is done: s must be large enough for the result.
 */
static void scriptsToString(const UScriptCode scripts[], int32_t length, char s[]) {
    int32_t idx = 0;

    if(length == 0) {
        strcpy(s, "(no scripts)");
        return;
    }

    *s = 0;
    while(idx < length) {
        if(idx != 0) {
            strcat(s, " ");
        }
        strcat(s, uscript_getShortName(scripts[idx]));
        ++idx;
    }
}
/**
 * Loads the compact trie dictionary configured for `script`.
 *
 * The dictionary file name is read from the "dictionaries" table of the root
 * brkitr resource bundle, keyed by the script's short name, and split at its
 * first dot into base name and extension. If the data file cannot be opened
 * but a file name was found, a dummy dictionary containing only the word " "
 * (U+0020) is returned instead.
 *
 * @param script  script whose dictionary is wanted
 * @return a new CompactTrieDictionary, or NULL when no file name is
 *         configured, allocation fails, or the dictionary constructor
 *         reports an error for the opened file.
 */
const CompactTrieDictionary *
ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
    UErrorCode status = U_ZERO_ERROR;
    // Open root from brkitr tree.
    char dictnbuff[256] = {'\0'};
    char ext[4] = {'\0'};
    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
    int32_t dictnlength = 0;
    const UChar *dictfname = ures_getString(b, &dictnlength, &status);
    if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
        dictnlength = 0;
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    if (U_SUCCESS(status) && dictfname) {
        const UChar *extStart = u_strchr(dictfname, 0x002e);
        int32_t len = 0;
        if (extStart != NULL) {
            len = (int32_t)(extStart - dictfname);
            // Copy exactly the characters after the dot. The previous code
            // always copied sizeof(ext) UChars, which read past the end of a
            // short extension and left a long one without a terminator.
            const int32_t extLength = dictnlength - len - 1;
            if (extLength < 0 || (size_t)extLength >= sizeof(ext)) {
                status = U_BUFFER_OVERFLOW_ERROR;
                len = 0;
            } else {
                u_UCharsToChars(extStart + 1, ext, extLength);
                ext[extLength] = 0;   // nul terminate
                u_UCharsToChars(dictfname, dictnbuff, len);
            }
        }
        dictnbuff[len] = 0; // nul terminate
    }
    ures_close(b);
    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
    if (U_SUCCESS(status)) {
        const CompactTrieDictionary *dict = new CompactTrieDictionary(
            file, status);
        if (dict == NULL) {
            // Allocation failed: no dictionary object exists to take over
            // `file`, so close it here instead of leaking it.
            udata_close(file);
            return NULL;
        }
        if (U_FAILURE(status)) {
            delete dict;
            dict = NULL;
        }
        return dict;
    } else if (dictfname != NULL) {
        //create dummy dict if dictionary filename not valid
        UChar c = 0x0020;
        status = U_ZERO_ERROR;
        MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
        if (mtd == NULL) {
            return NULL;
        }
        mtd->addWord(&c, 1, status, 1);
        const CompactTrieDictionary *dummy = new CompactTrieDictionary(*mtd, status);
        // NOTE(review): assumes the compact dictionary builds its own data
        // from *mtd and keeps no reference to it - confirm against the
        // CompactTrieDictionary(const MutableTrieDictionary&, UErrorCode&)
        // constructor. The mutable dictionary was previously leaked.
        delete mtd;
        return dummy;
    }
    return NULL;
}
void UnicodeTest::TestScriptMetadata() { IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); // So far, sample characters are uppercase. // Georgian is special. UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { UScriptCode sc = (UScriptCode)sci; // Run the test with -v to see which script has failures: // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL logln(uscript_getShortName(sc)); UScriptUsage usage = uscript_getUsage(sc); UnicodeString sample = uscript_getSampleUnicodeString(sc); UnicodeSet scriptSet; scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); if(usage == USCRIPT_USAGE_NOT_ENCODED) { assertTrue("not encoded, no sample", sample.isEmpty()); assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); assertFalse("not encoded, not cased", uscript_isCased(sc)); assertTrue("not encoded, no characters", scriptSet.isEmpty()); } else { assertFalse("encoded, has a sample character", sample.isEmpty()); UChar32 firstChar = sample.char32At(0); UScriptCode charScript = getCharScript(sc); assertEquals("script(sample(script))", charScript, uscript_getScript(firstChar, errorCode)); assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc)); assertEquals("cased vs. 
set", cased.contains(firstChar), uscript_isCased(sc)); assertEquals("encoded, has characters", sc == charScript, !scriptSet.isEmpty()); if(uscript_isRightToLeft(sc)) { rtl.removeAll(scriptSet); } if(uscript_isCased(sc)) { cased.removeAll(scriptSet); } } } UnicodeString pattern; assertEquals("no remaining RTL characters", UnicodeString("[]"), rtl.toPattern(pattern)); assertEquals("no remaining cased characters", UnicodeString("[]"), cased.toPattern(pattern)); assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); }
/* Converts an ICU script code to the HarfBuzz script derived from the code's
 * short name; USCRIPT_INVALID_CODE yields HB_SCRIPT_INVALID. */
static inline hb_script_t
_icu_script_to_script(UScriptCode script)
{
  return script == USCRIPT_INVALID_CODE
             ? HB_SCRIPT_INVALID
             : hb_script_from_string(uscript_getShortName(script), -1);
}
void TestUScriptCodeAPI(){ int i =0; int numErrors =0; { const char* testNames[]={ /* test locale */ "en", "en_US", "sr", "ta" , "te_IN", "hi", "he", "ar", /* test abbr */ "Hani", "Hang","Hebr","Hira", "Knda","Kana","Khmr","Lao", "Latn",/*"Latf","Latg",*/ "Mlym", "Mong", /* test names */ "CYRILLIC","DESERET","DEVANAGARI","ETHIOPIC","GEORGIAN", "GOTHIC", "GREEK", "GUJARATI", "COMMON", "INHERITED", /* test lower case names */ "malayalam", "mongolian", "myanmar", "ogham", "old-italic", "oriya", "runic", "sinhala", "syriac","tamil", "telugu", "thaana", "thai", "tibetan", /* test the bounds*/ "tagb", "arabic", /* test bogus */ "asfdasd", "5464", "12235", /* test the last index */ "zyyy", "YI", '\0' }; UScriptCode expected[] ={ /* locales should return */ USCRIPT_LATIN, USCRIPT_LATIN, USCRIPT_CYRILLIC, USCRIPT_TAMIL, USCRIPT_TELUGU, USCRIPT_DEVANAGARI, USCRIPT_HEBREW, USCRIPT_ARABIC, /* abbr should return */ USCRIPT_HAN, USCRIPT_HANGUL, USCRIPT_HEBREW, USCRIPT_HIRAGANA, USCRIPT_KANNADA, USCRIPT_KATAKANA, USCRIPT_KHMER, USCRIPT_LAO, USCRIPT_LATIN,/* USCRIPT_LATIN, USCRIPT_LATIN,*/ USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN, /* names should return */ USCRIPT_CYRILLIC, USCRIPT_DESERET, USCRIPT_DEVANAGARI, USCRIPT_ETHIOPIC, USCRIPT_GEORGIAN, USCRIPT_GOTHIC, USCRIPT_GREEK, USCRIPT_GUJARATI, USCRIPT_COMMON, USCRIPT_INHERITED, /* lower case names should return */ USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN, USCRIPT_MYANMAR, USCRIPT_OGHAM, USCRIPT_OLD_ITALIC, USCRIPT_ORIYA, USCRIPT_RUNIC, USCRIPT_SINHALA, USCRIPT_SYRIAC, USCRIPT_TAMIL, USCRIPT_TELUGU, USCRIPT_THAANA, USCRIPT_THAI, USCRIPT_TIBETAN, /* bounds */ USCRIPT_TAGBANWA, USCRIPT_ARABIC, /* bogus names should return invalid code */ USCRIPT_INVALID_CODE, USCRIPT_INVALID_CODE, USCRIPT_INVALID_CODE, USCRIPT_COMMON, USCRIPT_YI, }; UErrorCode err = U_ZERO_ERROR; const int32_t capacity = 10; for( ; testNames[i]!='\0'; i++){ UScriptCode script[10]={USCRIPT_INVALID_CODE}; uscript_getCode(testNames[i],script,capacity, &err); if( script[0] != 
expected[i]){ log_data_err("Error getting script code Got: %i Expected: %i for name %s (Error code does not propagate if data is not present. Are you missing data?)\n", script[0],expected[i],testNames[i]); numErrors++; } } if(numErrors >0 ){ log_data_err("Errors uchar_getScriptCode() : %i \n",numErrors); } } { UErrorCode err = U_ZERO_ERROR; int32_t capacity=0; int32_t j; UScriptCode jaCode[]={USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN }; UScriptCode script[10]={USCRIPT_INVALID_CODE}; int32_t num = uscript_getCode("ja",script,capacity, &err); /* preflight */ if(err==U_BUFFER_OVERFLOW_ERROR){ err = U_ZERO_ERROR; capacity = 10; num = uscript_getCode("ja",script,capacity, &err); if(num!=(sizeof(jaCode)/sizeof(UScriptCode))){ log_err("Errors uscript_getScriptCode() for Japanese locale: num=%d, expected %d \n", num, (sizeof(jaCode)/sizeof(UScriptCode))); } for(j=0;j<sizeof(jaCode)/sizeof(UScriptCode);j++) { if(script[j]!=jaCode[j]) { log_err("Japanese locale: code #%d was %d (%s) but expected %d (%s)\n", j, script[j], uscript_getName(script[j]), jaCode[j], uscript_getName(jaCode[j])); } } }else{ log_data_err("Errors in uscript_getScriptCode() expected error : %s got: %s \n", "U_BUFFER_OVERFLOW_ERROR", u_errorName(err)); } } { UScriptCode testAbbr[]={ /* names should return */ USCRIPT_CYRILLIC, USCRIPT_DESERET, USCRIPT_DEVANAGARI, USCRIPT_ETHIOPIC, USCRIPT_GEORGIAN, USCRIPT_GOTHIC, USCRIPT_GREEK, USCRIPT_GUJARATI, }; const char* expectedNames[]={ /* test names */ "Cyrillic","Deseret","Devanagari","Ethiopic","Georgian", "Gothic", "Greek", "Gujarati", '\0' }; i=0; while(i<sizeof(testAbbr)/sizeof(UScriptCode)){ const char* name = uscript_getName(testAbbr[i]); if(name == NULL) { log_data_err("Couldn't get script name\n"); return; } numErrors=0; if(strcmp(expectedNames[i],name)!=0){ log_err("Error getting abbreviations Got: %s Expected: %s\n",name,expectedNames[i]); numErrors++; } if(numErrors > 0){ if(numErrors >0 ){ log_err("Errors uchar_getScriptAbbr() : %i 
\n",numErrors); } } i++; } } { UScriptCode testAbbr[]={ /* abbr should return */ USCRIPT_HAN, USCRIPT_HANGUL, USCRIPT_HEBREW, USCRIPT_HIRAGANA, USCRIPT_KANNADA, USCRIPT_KATAKANA, USCRIPT_KHMER, USCRIPT_LAO, USCRIPT_LATIN, USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN, }; const char* expectedAbbr[]={ /* test abbr */ "Hani", "Hang","Hebr","Hira", "Knda","Kana","Khmr","Laoo", "Latn", "Mlym", "Mong", '\0' }; i=0; while(i<sizeof(testAbbr)/sizeof(UScriptCode)){ const char* name = uscript_getShortName(testAbbr[i]); numErrors=0; if(strcmp(expectedAbbr[i],name)!=0){ log_err("Error getting abbreviations Got: %s Expected: %s\n",name,expectedAbbr[i]); numErrors++; } if(numErrors > 0){ if(numErrors >0 ){ log_err("Errors uchar_getScriptAbbr() : %i \n",numErrors); } } i++; } } /* now test uscript_getScript() API */ { uint32_t codepoints[] = { 0x0000FF9D, /* USCRIPT_KATAKANA*/ 0x0000FFBE, /* USCRIPT_HANGUL*/ 0x0000FFC7, /* USCRIPT_HANGUL*/ 0x0000FFCF, /* USCRIPT_HANGUL*/ 0x0000FFD7, /* USCRIPT_HANGUL*/ 0x0000FFDC, /* USCRIPT_HANGUL*/ 0x00010300, /* USCRIPT_OLD_ITALIC*/ 0x00010330, /* USCRIPT_GOTHIC*/ 0x0001034A, /* USCRIPT_GOTHIC*/ 0x00010400, /* USCRIPT_DESERET*/ 0x00010428, /* USCRIPT_DESERET*/ 0x0001D167, /* USCRIPT_INHERITED*/ 0x0001D17B, /* USCRIPT_INHERITED*/ 0x0001D185, /* USCRIPT_INHERITED*/ 0x0001D1AA, /* USCRIPT_INHERITED*/ 0x00020000, /* USCRIPT_HAN*/ 0x00000D02, /* USCRIPT_MALAYALAM*/ 0x00000D00, /* USCRIPT_UNKNOWN (new Zzzz value in Unicode 5.0) */ 0x00000000, /* USCRIPT_COMMON*/ 0x0001D169, /* USCRIPT_INHERITED*/ 0x0001D182, /* USCRIPT_INHERITED*/ 0x0001D18B, /* USCRIPT_INHERITED*/ 0x0001D1AD, /* USCRIPT_INHERITED*/ }; UScriptCode expected[] = { USCRIPT_KATAKANA , USCRIPT_HANGUL , USCRIPT_HANGUL , USCRIPT_HANGUL , USCRIPT_HANGUL , USCRIPT_HANGUL , USCRIPT_OLD_ITALIC, USCRIPT_GOTHIC , USCRIPT_GOTHIC , USCRIPT_DESERET , USCRIPT_DESERET , USCRIPT_INHERITED, USCRIPT_INHERITED, USCRIPT_INHERITED, USCRIPT_INHERITED, USCRIPT_HAN , USCRIPT_MALAYALAM, USCRIPT_UNKNOWN, USCRIPT_COMMON, 
USCRIPT_INHERITED , USCRIPT_INHERITED , USCRIPT_INHERITED , USCRIPT_INHERITED , }; UScriptCode code = USCRIPT_INVALID_CODE; UErrorCode status = U_ZERO_ERROR; UBool passed = TRUE; for(i=0; i<LENGTHOF(codepoints); ++i){ code = uscript_getScript(codepoints[i],&status); if(U_SUCCESS(status)){ if( code != expected[i] || code != (UScriptCode)u_getIntPropertyValue(codepoints[i], UCHAR_SCRIPT) ) { log_err("uscript_getScript for codepoint \\U%08X failed\n",codepoints[i]); passed = FALSE; } }else{ log_err("uscript_getScript for codepoint \\U%08X failed. Error: %s\n", codepoints[i],u_errorName(status)); break; } } if(passed==FALSE){ log_err("uscript_getScript failed.\n"); } } { UScriptCode code= USCRIPT_INVALID_CODE; UErrorCode status = U_ZERO_ERROR; code = uscript_getScript(0x001D169,&status); if(code != USCRIPT_INHERITED){ log_err("\\U001D169 is not contained in USCRIPT_INHERITED"); } } { UScriptCode code= USCRIPT_INVALID_CODE; UErrorCode status = U_ZERO_ERROR; int32_t err = 0; for(i = 0; i<=0x10ffff; i++){ code = uscript_getScript(i,&status); if(code == USCRIPT_INVALID_CODE){ err++; log_err("uscript_getScript for codepoint \\U%08X failed.\n", i); } } if(err>0){ log_err("uscript_getScript failed for %d codepoints\n", err); } } { for(i=0; (UScriptCode)i< USCRIPT_CODE_LIMIT; i++){ const char* name = uscript_getName((UScriptCode)i); if(name==NULL || strcmp(name,"")==0){ log_err("uscript_getName failed for code %i: name is NULL or \"\"\n",i); } } } { /* * These script codes were originally added to ICU pre-3.6, so that ICU would * have all ISO 15924 script codes. ICU was then based on Unicode 4.1. * These script codes were added with only short names because we don't * want to invent long names ourselves. * Unicode 5 and later encode some of these scripts and give them long names. * Whenever this happens, the long script names here need to be updated. 
*/ static const char* expectedLong[] = { "Balinese", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp", "Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kayah_Li", "Latf", "Latg", "Lepcha", "Lina", "Mand", "Maya", "Mero", "Nko", "Orkh", "Perm", "Phags_Pa", "Phoenician", "Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform", "Zxxx", "Unknown", "Carian", "Jpan", "Lana", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra", "Sgnw", "Sundanese", "Moon", "Mtei", /* new in ICU 4.0 */ "Armi", "Avst", "Cakm", "Kore", "Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt", "Zmth", "Zsym", }; static const char* expectedShort[] = { "Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp", "Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kali", "Latf", "Latg", "Lepc", "Lina", "Mand", "Maya", "Mero", "Nkoo", "Orkh", "Perm", "Phag", "Phnx", "Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vaii", "Visp", "Xsux", "Zxxx", "Zzzz", "Cari", "Jpan", "Lana", "Lyci", "Lydi", "Olck", "Rjng", "Saur", "Sgnw", "Sund", "Moon", "Mtei", /* new in ICU 4.0 */ "Armi", "Avst", "Cakm", "Kore", "Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt", "Zmth", "Zsym", }; int32_t j = 0; for(i=USCRIPT_BALINESE; (UScriptCode)i<USCRIPT_CODE_LIMIT; i++, j++){ const char* name = uscript_getName((UScriptCode)i); if(name==NULL || strcmp(name,expectedLong[j])!=0){ log_err("uscript_getName failed for code %i: %s!=%s\n", i, name, expectedLong[j]); } name = uscript_getShortName((UScriptCode)i); if(name==NULL || strcmp(name,expectedShort[j])!=0){ log_err("uscript_getShortName failed for code %i: %s!=%s\n", i, name, expectedShort[j]); } } for(i=0; i<LENGTHOF(expectedLong); i++){ UScriptCode fillIn[5] = {USCRIPT_INVALID_CODE}; UErrorCode status = U_ZERO_ERROR; int32_t len = 0; len = uscript_getCode(expectedShort[i], fillIn, LENGTHOF(fillIn), &status); if(U_FAILURE(status)){ log_err("uscript_getCode failed 
for script name %s. Error: %s\n",expectedShort[i], u_errorName(status)); } if(len>1){ log_err("uscript_getCode did not return expected number of codes for script %s. EXPECTED: 1 GOT: %i\n", expectedShort[i], len); } if(fillIn[0]!= (UScriptCode)(USCRIPT_BALINESE+i)){ log_err("uscript_getCode did not return expected code for script %s. EXPECTED: %i GOT: %i\n", expectedShort[i], (USCRIPT_BALINESE+i), fillIn[0] ); } } } }
/*
 * Exercises the uscript_* API:
 *   - uscript_getCode() for locale IDs, script abbreviations, long names and
 *     bogus input, including the preflight (capacity 0) case for "ja";
 *   - uscript_getCode() for languages with one or several scripts, compared
 *     through assertEqualScripts();
 *   - uscript_getName() / uscript_getShortName() for selected script codes;
 *   - uscript_getScript() for selected code points, for every code point up
 *     to U+10FFFF, and for characters that have Script_Extensions;
 *   - the long and short names of every script code from USCRIPT_BALINESE up
 *     to USCRIPT_CODE_LIMIT, and the round trip back through uscript_getCode().
 *
 * Failures are reported through log_err()/log_data_err().
 */
void TestUScriptCodeAPI(){
    int i =0;
    int numErrors =0;
    {
        /* testNames[k] must map to expected[k]; the list is NULL-terminated. */
        const char* testNames[]={
        /* test locale */
        "en", "en_US", "sr", "ta" , "te_IN",
        "hi", "he", "ar",
        /* test abbr */
        "Hani", "Hang","Hebr","Hira",
        "Knda","Kana","Khmr","Lao",
        "Latn",/*"Latf","Latg",*/
        "Mlym", "Mong",

        /* test names */
        "CYRILLIC","DESERET","DEVANAGARI","ETHIOPIC","GEORGIAN",
        "GOTHIC", "GREEK", "GUJARATI", "COMMON", "INHERITED",
        /* test lower case names */
        "malayalam", "mongolian", "myanmar", "ogham", "old-italic",
        "oriya", "runic", "sinhala", "syriac","tamil",
        "telugu", "thaana", "thai", "tibetan",
        /* test the bounds*/
        "tagb", "arabic",
        /* test bogus */
        "asfdasd", "5464", "12235",
        /* test the last index */
        "zyyy", "YI",
        NULL
        };
        UScriptCode expected[] ={
        /* locales should return */
        USCRIPT_LATIN, USCRIPT_LATIN, USCRIPT_CYRILLIC, USCRIPT_TAMIL, USCRIPT_TELUGU,
        USCRIPT_DEVANAGARI, USCRIPT_HEBREW, USCRIPT_ARABIC,
        /* abbr should return */
        USCRIPT_HAN, USCRIPT_HANGUL, USCRIPT_HEBREW, USCRIPT_HIRAGANA,
        USCRIPT_KANNADA, USCRIPT_KATAKANA, USCRIPT_KHMER, USCRIPT_LAO,
        USCRIPT_LATIN,/* USCRIPT_LATIN, USCRIPT_LATIN,*/
        USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN,
        /* names should return */
        USCRIPT_CYRILLIC, USCRIPT_DESERET, USCRIPT_DEVANAGARI, USCRIPT_ETHIOPIC, USCRIPT_GEORGIAN,
        USCRIPT_GOTHIC, USCRIPT_GREEK, USCRIPT_GUJARATI, USCRIPT_COMMON, USCRIPT_INHERITED,
        /* lower case names should return */
        USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN, USCRIPT_MYANMAR, USCRIPT_OGHAM, USCRIPT_OLD_ITALIC,
        USCRIPT_ORIYA, USCRIPT_RUNIC, USCRIPT_SINHALA, USCRIPT_SYRIAC, USCRIPT_TAMIL,
        USCRIPT_TELUGU, USCRIPT_THAANA, USCRIPT_THAI, USCRIPT_TIBETAN,
        /* bounds */
        USCRIPT_TAGBANWA, USCRIPT_ARABIC,
        /* bogus names should return invalid code */
        USCRIPT_INVALID_CODE,
        USCRIPT_INVALID_CODE,
        USCRIPT_INVALID_CODE,
        USCRIPT_COMMON, USCRIPT_YI,
        };

        UErrorCode err = U_ZERO_ERROR;

        const int32_t capacity = 10;

        /* Only the first returned code is compared; err is not reset between names. */
        for( ; testNames[i]!=NULL; i++){
            UScriptCode script[10]={USCRIPT_INVALID_CODE};
            uscript_getCode(testNames[i],script,capacity, &err);
            if( script[0] != expected[i]){
                log_data_err("Error getting script code Got: %i Expected: %i for name %s (Error code does not propagate if data is not present. Are you missing data?)\n",
                    script[0],expected[i],testNames[i]);
                numErrors++;
            }
        }
        if(numErrors >0 ){
            log_data_err("Errors uchar_getScriptCode() : %i \n",numErrors);
        }
    }

    {
        /* Preflight: with capacity 0 the call is expected to report
         * U_BUFFER_OVERFLOW_ERROR; the retry with room must return jaCode. */
        UErrorCode err = U_ZERO_ERROR;
        int32_t capacity=0;
        int32_t j;
        UScriptCode jaCode[]={USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN };
        UScriptCode script[10]={USCRIPT_INVALID_CODE};
        int32_t num = uscript_getCode("ja",script,capacity, &err);
        /* preflight */
        if(err==U_BUFFER_OVERFLOW_ERROR){
            err = U_ZERO_ERROR;
            capacity = 10;
            num = uscript_getCode("ja",script,capacity, &err);
            if(num!=UPRV_LENGTHOF(jaCode)){
                log_err("Errors uscript_getScriptCode() for Japanese locale: num=%d, expected %d \n",
                        num, UPRV_LENGTHOF(jaCode));
            }
            for(j=0;j<UPRV_LENGTHOF(jaCode);j++) {
                if(script[j]!=jaCode[j]) {
                    log_err("Japanese locale: code #%d was %d (%s) but expected %d (%s)\n", j,
                            script[j], uscript_getName(script[j]),
                            jaCode[j], uscript_getName(jaCode[j]));
                }
            }
        }else{
            log_data_err("Errors in uscript_getScriptCode() expected error : %s got: %s \n",
                "U_BUFFER_OVERFLOW_ERROR",
                 u_errorName(err));
        }

    }

    {
        /* Expected script lists for the language-to-script checks below. */
        static const UScriptCode LATIN[1] = { USCRIPT_LATIN };
        static const UScriptCode CYRILLIC[1] = { USCRIPT_CYRILLIC };
        static const UScriptCode DEVANAGARI[1] = { USCRIPT_DEVANAGARI };
        static const UScriptCode HAN[1] = { USCRIPT_HAN };
        static const UScriptCode JAPANESE[3] = { USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN };
        static const UScriptCode KOREAN[2] = { USCRIPT_HANGUL, USCRIPT_HAN };
        static const UScriptCode HAN_BOPO[2] = { USCRIPT_HAN, USCRIPT_BOPOMOFO };
        UScriptCode scripts[5];
        UErrorCode err;
        int32_t num;

        // Should work regardless of whether we have locale data for the language.
        err = U_ZERO_ERROR;
        num = uscript_getCode("tg", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("tg script: Cyrl", CYRILLIC, 1, scripts, num, err); // Tajik
        err = U_ZERO_ERROR;
        num = uscript_getCode("xsr", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("xsr script: Deva", DEVANAGARI, 1, scripts, num, err); // Sherpa

        // Multi-script languages.
        err = U_ZERO_ERROR;
        num = uscript_getCode("ja", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("ja scripts: Kana Hira Hani",
                           JAPANESE, UPRV_LENGTHOF(JAPANESE), scripts, num, err);
        err = U_ZERO_ERROR;
        num = uscript_getCode("ko", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("ko scripts: Hang Hani",
                           KOREAN, UPRV_LENGTHOF(KOREAN), scripts, num, err);
        err = U_ZERO_ERROR;
        num = uscript_getCode("zh", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("zh script: Hani", HAN, 1, scripts, num, err);
        err = U_ZERO_ERROR;
        num = uscript_getCode("zh-Hant", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("zh-Hant scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err);
        err = U_ZERO_ERROR;
        num = uscript_getCode("zh-TW", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("zh-TW scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err);

        // Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
        err = U_ZERO_ERROR;
        num = uscript_getCode("ro-RO", scripts, UPRV_LENGTHOF(scripts), &err);
        assertEqualScripts("ro-RO script: Latn", LATIN, 1, scripts, num, err);
    }

    {
        /* Long names: testAbbr[k] must be named expectedNames[k]. */
        UScriptCode testAbbr[]={
            /* names should return */
            USCRIPT_CYRILLIC, USCRIPT_DESERET, USCRIPT_DEVANAGARI, USCRIPT_ETHIOPIC, USCRIPT_GEORGIAN,
            USCRIPT_GOTHIC, USCRIPT_GREEK, USCRIPT_GUJARATI,
        };

        const char* expectedNames[]={

            /* test names */
            "Cyrillic","Deseret","Devanagari","Ethiopic","Georgian",
            "Gothic", "Greek", "Gujarati",
             NULL
        };
        i=0;
        while(i<UPRV_LENGTHOF(testAbbr)){
            const char* name = uscript_getName(testAbbr[i]);
            /* A NULL name is reported as a data error and ends the whole test. */
            if(name == NULL) {
              log_data_err("Couldn't get script name\n");
              return;
            }
            /* numErrors is reset for every entry, so it is only ever 0 or 1 here. */
            numErrors=0;
            if(strcmp(expectedNames[i],name)!=0){
                log_err("Error getting abbreviations Got: %s Expected: %s\n",name,expectedNames[i]);
                numErrors++;
            }
            if(numErrors > 0){
                if(numErrors >0 ){
                    log_err("Errors uchar_getScriptAbbr() : %i \n",numErrors);
                }
            }
            i++;
        }

    }

    {
        /* Short names: testAbbr[k] must be abbreviated expectedAbbr[k]. */
        UScriptCode testAbbr[]={
            /* abbr should return */
            USCRIPT_HAN, USCRIPT_HANGUL, USCRIPT_HEBREW, USCRIPT_HIRAGANA,
            USCRIPT_KANNADA, USCRIPT_KATAKANA, USCRIPT_KHMER, USCRIPT_LAO,
            USCRIPT_LATIN,
            USCRIPT_MALAYALAM, USCRIPT_MONGOLIAN,
        };

        const char* expectedAbbr[]={
              /* test abbr */
            "Hani", "Hang","Hebr","Hira",
            "Knda","Kana","Khmr","Laoo",
            "Latn",
            "Mlym", "Mong",
             NULL
        };
        i=0;
        while(i<UPRV_LENGTHOF(testAbbr)){
            /* NOTE(review): unlike the long-name loop above, name is not
             * checked for NULL before strcmp(). */
            const char* name = uscript_getShortName(testAbbr[i]);
            numErrors=0;
            if(strcmp(expectedAbbr[i],name)!=0){
                log_err("Error getting abbreviations Got: %s Expected: %s\n",name,expectedAbbr[i]);
                numErrors++;
            }
            if(numErrors > 0){
                if(numErrors >0 ){
                    log_err("Errors uchar_getScriptAbbr() : %i \n",numErrors);
                }
            }
            i++;
        }

    }
    /* now test uscript_getScript() API */
    {
        /* codepoints[k] is expected to have script expected[k]. */
        uint32_t codepoints[] = {
                0x0000FF9D, /* USCRIPT_KATAKANA*/
                0x0000FFBE, /* USCRIPT_HANGUL*/
                0x0000FFC7, /* USCRIPT_HANGUL*/
                0x0000FFCF, /* USCRIPT_HANGUL*/
                0x0000FFD7, /* USCRIPT_HANGUL*/
                0x0000FFDC, /* USCRIPT_HANGUL*/
                0x00010300, /* USCRIPT_OLD_ITALIC*/
                0x00010330, /* USCRIPT_GOTHIC*/
                0x0001034A, /* USCRIPT_GOTHIC*/
                0x00010400, /* USCRIPT_DESERET*/
                0x00010428, /* USCRIPT_DESERET*/
                0x0001D167, /* USCRIPT_INHERITED*/
                0x0001D17B, /* USCRIPT_INHERITED*/
                0x0001D185, /* USCRIPT_INHERITED*/
                0x0001D1AA, /* USCRIPT_INHERITED*/
                0x00020000, /* USCRIPT_HAN*/
                0x00000D02, /* USCRIPT_MALAYALAM*/
                0x00000D00, /* USCRIPT_UNKNOWN (new Zzzz value in Unicode 5.0) */
                0x00000000, /* USCRIPT_COMMON*/
                0x0001D169, /* USCRIPT_INHERITED*/
                0x0001D182, /* USCRIPT_INHERITED*/
                0x0001D18B, /* USCRIPT_INHERITED*/
                0x0001D1AD, /* USCRIPT_INHERITED*/
        };

        UScriptCode expected[] = {
                USCRIPT_KATAKANA ,
                USCRIPT_HANGUL ,
                USCRIPT_HANGUL ,
                USCRIPT_HANGUL ,
                USCRIPT_HANGUL ,
                USCRIPT_HANGUL ,
                USCRIPT_OLD_ITALIC,
                USCRIPT_GOTHIC ,
                USCRIPT_GOTHIC ,
                USCRIPT_DESERET ,
                USCRIPT_DESERET ,
                USCRIPT_INHERITED,
                USCRIPT_INHERITED,
                USCRIPT_INHERITED,
                USCRIPT_INHERITED,
                USCRIPT_HAN ,
                USCRIPT_MALAYALAM,
                USCRIPT_UNKNOWN,
                USCRIPT_COMMON,
                USCRIPT_INHERITED ,
                USCRIPT_INHERITED ,
                USCRIPT_INHERITED ,
                USCRIPT_INHERITED ,
        };
        UScriptCode code = USCRIPT_INVALID_CODE;
        UErrorCode status = U_ZERO_ERROR;
        UBool passed = TRUE;

        for(i=0; i<UPRV_LENGTHOF(codepoints); ++i){
            code = uscript_getScript(codepoints[i],&status);
            if(U_SUCCESS(status)){
                /* The result must also agree with the generic property getter. */
                if( code != expected[i] ||
                    code != (UScriptCode)u_getIntPropertyValue(codepoints[i], UCHAR_SCRIPT)
                ) {
                    log_err("uscript_getScript for codepoint \\U%08X failed\n",codepoints[i]);
                    passed = FALSE;
                }
            }else{
                log_err("uscript_getScript for codepoint \\U%08X failed. Error: %s\n",
                         codepoints[i],u_errorName(status));
                break;
            }
        }

        if(passed==FALSE){
           log_err("uscript_getScript failed.\n");
        }
    }
    {
        UScriptCode code= USCRIPT_INVALID_CODE;
        UErrorCode status = U_ZERO_ERROR;
        code = uscript_getScript(0x001D169,&status);
        if(code != USCRIPT_INHERITED){
            log_err("\\U001D169 is not contained in USCRIPT_INHERITED");
        }
    }
    {
        /* Every code point must map to some valid script code; status is not inspected here. */
        UScriptCode code= USCRIPT_INVALID_CODE;
        UErrorCode status = U_ZERO_ERROR;
        int32_t err = 0;

        for(i = 0; i<=0x10ffff; i++){
            code = uscript_getScript(i,&status);
            if(code == USCRIPT_INVALID_CODE){
                err++;
                log_err("uscript_getScript for codepoint \\U%08X failed.\n", i);
            }
        }
        if(err>0){
            log_err("uscript_getScript failed for %d codepoints\n", err);
        }
    }
    {
        /* Every script code below the limit must have a non-empty long name. */
        for(i=0; (UScriptCode)i< USCRIPT_CODE_LIMIT; i++){
            const char* name = uscript_getName((UScriptCode)i);
            if(name==NULL || strcmp(name,"")==0){
                log_err("uscript_getName failed for code %i: name is NULL or \"\"\n",i);
            }
        }
    }

    {
        /*
         * These script codes were originally added to ICU pre-3.6, so that ICU would
         * have all ISO 15924 script codes. ICU was then based on Unicode 4.1.
         * These script codes were added with only short names because we don't
         * want to invent long names ourselves.
         * Unicode 5 and later encode some of these scripts and give them long names.
         * Whenever this happens, the long script names here need to be updated.
         */
        /* Both tables are indexed by (script code - USCRIPT_BALINESE). */
        static const char* expectedLong[] = {
            "Balinese", "Batak", "Blis", "Brahmi", "Cham", "Cirt", "Cyrs",
            "Egyd", "Egyh", "Egyptian_Hieroglyphs",
            "Geok", "Hans", "Hant", "Pahawh_Hmong", "Old_Hungarian", "Inds",
            "Javanese", "Kayah_Li", "Latf", "Latg",
            "Lepcha", "Linear_A", "Mandaic", "Maya", "Meroitic_Hieroglyphs",
            "Nko", "Old_Turkic", "Old_Permic", "Phags_Pa", "Phoenician",
            "Miao", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform",
            "Zxxx", "Unknown",
            "Carian", "Jpan", "Tai_Tham", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra",
            "SignWriting", "Sundanese", "Moon", "Meetei_Mayek",
            /* new in ICU 4.0 */
            "Imperial_Aramaic", "Avestan", "Chakma", "Kore",
            "Kaithi", "Manichaean", "Inscriptional_Pahlavi", "Psalter_Pahlavi", "Phlv",
            "Inscriptional_Parthian", "Samaritan", "Tai_Viet",
            "Zmth", "Zsym",
            /* new in ICU 4.4 */
            "Bamum", "Lisu", "Nkgb", "Old_South_Arabian",
            /* new in ICU 4.6 */
            "Bassa_Vah", "Duployan", "Elbasan", "Grantha", "Kpel",
            "Loma", "Mende_Kikakui", "Meroitic_Cursive",
            "Old_North_Arabian", "Nabataean", "Palmyrene", "Khudawadi", "Warang_Citi",
            /* new in ICU 4.8 */
            "Afak", "Jurc", "Mro", "Nshu", "Sharada", "Sora_Sompeng", "Takri", "Tangut", "Wole",
            /* new in ICU 49 */
            "Anatolian_Hieroglyphs", "Khojki", "Tirhuta",
            /* new in ICU 52 */
            "Caucasian_Albanian", "Mahajani",
            /* new in ICU 54 */
            "Ahom", "Hatran", "Modi", "Multani", "Pau_Cin_Hau", "Siddham",
            // new in ICU 58
            "Adlam", "Bhaiksuki", "Marchen", "Newa", "Osage",
            "Hanb", "Jamo", "Zsye"
        };
        static const char* expectedShort[] = {
            "Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp",
            "Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kali", "Latf", "Latg",
            "Lepc", "Lina", "Mand", "Maya", "Mero", "Nkoo", "Orkh", "Perm", "Phag", "Phnx",
            "Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vaii", "Visp", "Xsux",
            "Zxxx", "Zzzz",
            "Cari", "Jpan", "Lana", "Lyci", "Lydi", "Olck", "Rjng", "Saur",
            "Sgnw", "Sund", "Moon", "Mtei",
            /* new in ICU 4.0 */
            "Armi", "Avst", "Cakm", "Kore",
            "Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt",
            "Zmth", "Zsym",
            /* new in ICU 4.4 */
            "Bamu", "Lisu", "Nkgb", "Sarb",
            /* new in ICU 4.6 */
            "Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc",
            "Narb", "Nbat", "Palm", "Sind", "Wara",
            /* new in ICU 4.8 */
            "Afak", "Jurc", "Mroo", "Nshu", "Shrd", "Sora", "Takr", "Tang", "Wole",
            /* new in ICU 49 */
            "Hluw", "Khoj", "Tirh",
            /* new in ICU 52 */
            "Aghb", "Mahj",
            /* new in ICU 54 */
            "Ahom", "Hatr", "Modi", "Mult", "Pauc", "Sidd",
            // new in ICU 58
            "Adlm", "Bhks", "Marc", "Newa", "Osge",
            "Hanb", "Jamo", "Zsye"
        };
        int32_t j = 0;
        /* Stop here if expectedLong does not cover every code up to
         * USCRIPT_CODE_LIMIT; the loops below would index past its end. */
        if(UPRV_LENGTHOF(expectedLong)!=(USCRIPT_CODE_LIMIT-USCRIPT_BALINESE)) {
            log_err("need to add new script codes in cucdapi.c!\n");
            return;
        }
        for(i=USCRIPT_BALINESE; (UScriptCode)i<USCRIPT_CODE_LIMIT; i++, j++){
            const char* name = uscript_getName((UScriptCode)i);
            if(name==NULL || strcmp(name,expectedLong[j])!=0){
                log_err("uscript_getName failed for code %i: %s!=%s\n", i, name, expectedLong[j]);
            }
            name = uscript_getShortName((UScriptCode)i);
            if(name==NULL || strcmp(name,expectedShort[j])!=0){
                log_err("uscript_getShortName failed for code %i: %s!=%s\n", i, name, expectedShort[j]);
            }
        }
        /* Round trip: each short name must resolve to exactly its own code. */
        for(i=0; i<UPRV_LENGTHOF(expectedLong); i++){
            UScriptCode fillIn[5] = {USCRIPT_INVALID_CODE};
            UErrorCode status = U_ZERO_ERROR;
            int32_t len = 0;
            len = uscript_getCode(expectedShort[i], fillIn, UPRV_LENGTHOF(fillIn), &status);
            if(U_FAILURE(status)){
                log_err("uscript_getCode failed for script name %s. Error: %s\n",expectedShort[i], u_errorName(status));
            }
            if(len>1){
                log_err("uscript_getCode did not return expected number of codes for script %s. EXPECTED: 1 GOT: %i\n", expectedShort[i], len);
            }
            if(fillIn[0]!= (UScriptCode)(USCRIPT_BALINESE+i)){
                log_err("uscript_getCode did not return expected code for script %s. EXPECTED: %i GOT: %i\n", expectedShort[i], (USCRIPT_BALINESE+i), fillIn[0] );
            }
        }
    }

    {
        /* test characters which have Script_Extensions */
        UErrorCode errorCode=U_ZERO_ERROR;
        if(!(
            USCRIPT_COMMON==uscript_getScript(0x0640, &errorCode) &&
            USCRIPT_INHERITED==uscript_getScript(0x0650, &errorCode) &&
            USCRIPT_ARABIC==uscript_getScript(0xfdf2, &errorCode)) ||
            U_FAILURE(errorCode)
        ) {
            log_err("uscript_getScript(character with Script_Extensions) failed\n");
        }
    }
}