/*
 * NIF entry point: build an Erlang term from the enumeration of
 * transliterator IDs that ICU reports.
 *
 * Accepts no Erlang arguments; any other arity produces badarg. The
 * enumeration obtained from utrans_openIDs() is handed to enum_to_term()
 * and closed again before the resulting term is returned.
 */
ERL_NIF_TERM trans_ids(ErlNifEnv* env, int argc,
    const ERL_NIF_TERM /*argv*/[])
{
    UErrorCode status = U_ZERO_ERROR;
    UEnumeration* ids;
    ERL_NIF_TERM result;

    if (argc != 0) {
        return enif_make_badarg(env);
    }

    ids = utrans_openIDs(&status);
    /* CHECK is defined elsewhere in the project; presumably it returns an
     * error term from this function when status holds a failure --
     * TODO confirm against the macro definition. */
    CHECK(env, status);

    result = enum_to_term(env, ids);
    uenum_close(ids);

    return result;
}
static void print_icu_transliterators(const struct config_t *p_config) { UErrorCode status; UEnumeration *en = utrans_openIDs(&status); int32_t count = uenum_count(en, &status); const char *name; int32_t length; if (p_config->xmloutput) fprintf(p_config->outfile, "<transliterators count=\"%d\">\n", count); else fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count); while ((name = uenum_next(en, &length, &status))) { if (p_config->xmloutput) fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name); else fprintf(p_config->outfile, "%s\n", name); } uenum_close(en); if (p_config->xmloutput) fprintf(p_config->outfile, "</transliterators>\n"); else { fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n" " Pattern Description\n" " Ranges [a-z] The lower case letters a through z\n" " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" " String [abc{def}] chars a, b and c, and string 'def'\n" " Categories [\\p{Letter}] Perl General Category 'Letter'.\n" " Categories [:Letter:] Posix General Category 'Letter'.\n" "\n" " Combination Example\n" " Union [[:Greek:] [:letter:]]\n" " Intersection [[:Greek:] & [:letter:]]\n" " Set Complement [[:Greek:] - [:letter:]]\n" " Complement [^[:Greek:] [:letter:]]\n" "\n" "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n" "\n" "Examples:\n" " [:Punctuation:] Any-Remove\n" " [:Cased-Letter:] Any-Upper\n" " [:Control:] Any-Remove\n" " [:Decimal_Number:] Any-Remove\n" " [:Final_Punctuation:] Any-Remove\n" " [:Georgian:] Any-Upper\n" " [:Katakana:] Any-Remove\n" " [:Arabic:] Any-Remove\n" " [:Punctuation:] Remove\n" " [[:Punctuation:]-[.,]] Remove\n" " [:Line_Separator:] Any-Remove\n" " [:Math_Symbol:] Any-Remove\n" " Lower; [:^Letter:] Remove (word tokenization)\n" " [:^Number:] Remove (numeric tokenization)\n" " [:^Katagana:] Remove (remove everything except Katagana)\n" " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n" " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents 
from characters)\n" " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" "\n" "see http://userguide.icu-project.org/transforms/general\n" " http://www.unicode.org/reports/tr44/\n" ); fprintf(p_config->outfile, "\n\n"); } }
static void TestUnicodeIDs() { UEnumeration *uenum; UTransliterator *utrans; const UChar *id, *id2; int32_t idLength, id2Length, count, count2; UErrorCode errorCode; errorCode=U_ZERO_ERROR; uenum=utrans_openIDs(&errorCode); if(U_FAILURE(errorCode)) { log_err("utrans_openIDs() failed - %s\n", u_errorName(errorCode)); return; } count=uenum_count(uenum, &errorCode); if(U_FAILURE(errorCode) || count<1) { log_err("uenum_count(transliterator IDs)=%d - %s\n", count, u_errorName(errorCode)); } count=0; for(;;) { id=uenum_unext(uenum, &idLength, &errorCode); if(U_FAILURE(errorCode)) { log_err("uenum_unext(transliterator ID %d) failed - %s\n", count, u_errorName(errorCode)); break; } if(id==NULL) { break; } if(++count>10) { /* try to actually open only a few transliterators */ continue; } utrans=utrans_openU(id, idLength, UTRANS_FORWARD, NULL, 0, NULL, &errorCode); if(U_FAILURE(errorCode)) { log_err("utrans_openU(%s) failed - %s\n", aescstrdup(id, idLength), u_errorName(errorCode)); continue; } id2=utrans_getUnicodeID(utrans, &id2Length); if(idLength!=id2Length || 0!=u_memcmp(id, id2, idLength)) { log_err("utrans_getUnicodeID(%s) does not match the original ID\n", aescstrdup(id, idLength)); } utrans_close(utrans); } uenum_reset(uenum, &errorCode); if(U_FAILURE(errorCode) || count<1) { log_err("uenum_reset(transliterator IDs) failed - %s\n", u_errorName(errorCode)); } else { count2=uenum_count(uenum, &errorCode); if(U_FAILURE(errorCode) || count<1) { log_err("2nd uenum_count(transliterator IDs)=%d - %s\n", count2, u_errorName(errorCode)); } else if(count!=count2) { log_err("uenum_unext(transliterator IDs) returned %d IDs but uenum_count() after uenum_reset() claims there are %d\n", count, count2); } } uenum_close(uenum); }
static inline UTransliterator *utrans_find(CFStringRef transform, UTransDirection dir, UErrorCode *error) { UEnumeration *uenum = NULL; UTransliterator *trans = NULL; do { uenum = utrans_openIDs(error); if (U_FAILURE(*error)) { DEBUG_LOG("%s", u_errorName(*error)); break; } int32_t count = uenum_count(uenum, error); if (U_FAILURE(*error)) { DEBUG_LOG("%s", u_errorName(*error)); break; } int32_t trans_idx = 0; while (trans_idx < count && trans == NULL) { int32_t idLen = 0; const UChar *uid = uenum_unext(uenum, &idLen, error); if (U_FAILURE(*error)) { DEBUG_LOG("%s", u_errorName(*error)); break; } // this seems rather unlikely since we should have already broken // by the trans_idx exceeding the count if (uid == NULL) { break; } CFStringRef name = CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, uid, idLen, kCFAllocatorNull); // It would have been nice if these stirng constants were actually defined somewhere in icu, but sadly they are runtime metadata... if ((CFEqual(name, CFSTR("Any-Remove")) && CFEqual(transform, kCFStringTransformStripCombiningMarks)) || (CFEqual(name, CFSTR("Any-Latin")) && CFEqual(transform, kCFStringTransformToLatin)) || (CFEqual(name, CFSTR("Latin-Katakana")) && CFEqual(transform, kCFStringTransformLatinKatakana)) || (CFEqual(name, CFSTR("Latin-Hiragana")) && CFEqual(transform, kCFStringTransformLatinHiragana)) || (CFEqual(name, CFSTR("Hiragana-Katakana")) && CFEqual(transform, kCFStringTransformHiraganaKatakana)) || (CFEqual(name, CFSTR("Latin-Hangul")) && CFEqual(transform, kCFStringTransformLatinHangul)) || (CFEqual(name, CFSTR("Latin-Arabic")) && CFEqual(transform, kCFStringTransformLatinArabic)) || (CFEqual(name, CFSTR("Latin-Hebrew")) && CFEqual(transform, kCFStringTransformLatinHebrew)) || (CFEqual(name, CFSTR("Latin-Thai")) && CFEqual(transform, kCFStringTransformLatinThai)) || (CFEqual(name, CFSTR("Latin-Cyrillic")) && CFEqual(transform, kCFStringTransformLatinCyrillic)) || (CFEqual(name, CFSTR("Latin-Greek")) && 
CFEqual(transform, kCFStringTransformLatinGreek)) || (CFEqual(name, CFSTR("Any-Hex/XML")) && CFEqual(transform, kCFStringTransformToXMLHex)) || (CFEqual(name, CFSTR("Any-Name")) && CFEqual(transform, kCFStringTransformToUnicodeName)) || (CFEqual(name, CFSTR("Accents-Any")) && CFEqual(transform, kCFStringTransformStripDiacritics))) { trans = utrans_openU(uid, idLen, dir, NULL, 0, NULL, error); } CFRelease(name); trans_idx++; } } while (0); if (uenum != NULL) { uenum_reset(uenum, error); uenum_close(uenum); } if (trans == NULL && (CFEqual(transform, kCFStringTransformStripCombiningMarks) || CFEqual(transform, kCFStringTransformToLatin) || CFEqual(transform, kCFStringTransformLatinKatakana) || CFEqual(transform, kCFStringTransformLatinHiragana) || CFEqual(transform, kCFStringTransformHiraganaKatakana) || CFEqual(transform, kCFStringTransformLatinHangul) || CFEqual(transform, kCFStringTransformLatinArabic) || CFEqual(transform, kCFStringTransformLatinHebrew) || CFEqual(transform, kCFStringTransformLatinCyrillic) || CFEqual(transform, kCFStringTransformLatinGreek) || CFEqual(transform, kCFStringTransformToXMLHex) || CFEqual(transform, kCFStringTransformToUnicodeName) || CFEqual(transform, kCFStringTransformStripDiacritics))) { static dispatch_once_t once = 0L; dispatch_once(&once, ^{ RELEASE_LOG("Unable to find transliterators in icu data: likely this is from not including the Transliterators section in building your icu.dat file"); }); }