// Helper removes joiners from strings that contain no letters. static void StripJoiners(std::vector<char32>* str32) { for (char32 ch : *str32) { if (u_isalpha(ch)) return; } int len = 0; for (char32 ch : *str32) { if (ch != Validator::kZeroWidthJoiner && ch != Validator::kZeroWidthNonJoiner) { (*str32)[len++] = ch; } } str32->resize(len); }
static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags) { ASSERT_ARGS(u_iscclass) #if PARROT_HAS_ICU UNUSED(interp); /* XXX which one return u_charDigitValue(codepoint); */ if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1; if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1; if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1; if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1; if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1; if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1; if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1; if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1; if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1; if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1; if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1; if ((flags & enum_cclass_word) && (u_isalnum(codepoint) || codepoint == '_')) return 1; if ((flags & enum_cclass_newline) && (codepoint == 0x2028 || codepoint == 0x2029 || u_hasBinaryProperty(codepoint, UCHAR_LINE_BREAK))) return 1; return 0; #else if (codepoint < 256) return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0; if (flags == enum_cclass_any) return 1; /* All codepoints from u+0100 to u+02af are alphabetic, so we * cheat on the WORD and ALPHABETIC properties to include these * (and incorrectly exclude all others). This is a stopgap until * ICU is everywhere, or we have better non-ICU unicode support. */ if (flags == enum_cclass_word || flags == enum_cclass_alphabetic) return (codepoint < 0x2b0); if (flags & enum_cclass_whitespace) { /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */ switch (codepoint) { case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029: case 0x202f: case 0x205f: case 0x3000: return 1; default: break; } } if (flags & enum_cclass_numeric) { /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */ if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1; if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1; if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1; if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1; if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1; if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1; if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1; if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1; if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1; if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1; if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1; if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1; if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1; if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1; if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1; if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1; if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1; if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1; if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1; if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1; if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1; if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1; } if (flags & enum_cclass_newline) { /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt * Line_Break=Mandatory_Break*/ if (codepoint == 0x2028 || codepoint == 0x2029) return 1; } if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline)) Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR, "no ICU lib loaded"); return 0; #endif }
static jboolean Character_isLetterImpl(JNIEnv*, jclass, jint codePoint) { return u_isalpha(codePoint); }
//static jboolean Character_isLetterImpl(JNIEnv*, jclass, jint codePoint) { JNIEXPORT jboolean JNICALL Java_java_lang_Character_isLetterImpl(JNIEnv*, jclass, jint codePoint) { return u_isalpha(codePoint); }
UErrorCode convsample_06() { printf("\n\n==============================================\n" "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); FILE *f; int32_t count; char inBuf[BUFFERSIZE]; const char *source; const char *sourceLimit; UChar *uBuf; int32_t uBufSize = 0; UConverter *conv; UErrorCode status = U_ZERO_ERROR; uint32_t letters=0, total=0; CharFreqInfo *info; UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ UChar32 p; uint32_t ie = 0; uint32_t gh = 0; UChar32 l = 0; f = fopen("data06.txt", "r"); if(!f) { fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); return U_FILE_ACCESS_ERROR; } info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); if(!info) { fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); } /* reset frequencies */ for(p=0;p<charCount;p++) { info[p].codepoint = p; info[p].frequency = 0; } // **************************** START SAMPLE ******************* conv = ucnv_open("utf-8", &status); assert(U_SUCCESS(status)); uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); printf("input bytes %d / min chars %d = %d UChars\n", BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); assert(uBuf!=NULL); // grab another buffer's worth while((!feof(f)) && ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) { // Convert bytes to unicode source = inBuf; sourceLimit = inBuf + count; while(source < sourceLimit) { p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); if(U_FAILURE(status)) { fprintf(stderr, "%s @ %d\n", u_errorName(status), total); status = U_ZERO_ERROR; continue; } U_ASSERT(status); total++; if(u_isalpha(p)) letters++; if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) ie++; if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) gh++; if(p>charCount) { fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); return U_UNSUPPORTED_ERROR; } info[p].frequency++; l = p; } } fclose(f); ucnv_close(conv); printf("%d letters out of %d total UChars.\n", letters, total); printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); // now, we could sort it.. // qsort(info, charCount, sizeof(info[0]), charfreq_compare); for(p=0;p<charCount;p++) { if(info[p].frequency) { printf("% 5d U+%06X ", info[p].frequency, p); if(p <= 0xFFFF) { prettyPrintUChar((UChar)p); } printf("\n"); } } free(info); // ***************************** END SAMPLE ******************** printf("\n"); return U_ZERO_ERROR; }
UErrorCode convsample_05() { printf("\n\n==============================================\n" "Sample 05: C: count the number of letters in a UTF-8 document\n"); FILE *f; int32_t count; char inBuf[BUFFERSIZE]; const char *source; const char *sourceLimit; UChar *uBuf; UChar *target; UChar *targetLimit; UChar *p; int32_t uBufSize = 0; UConverter *conv; UErrorCode status = U_ZERO_ERROR; uint32_t letters=0, total=0; f = fopen("data01.txt", "r"); if(!f) { fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); return U_FILE_ACCESS_ERROR; } // **************************** START SAMPLE ******************* conv = ucnv_open("utf-8", &status); assert(U_SUCCESS(status)); uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); printf("input bytes %d / min chars %d = %d UChars\n", BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); assert(uBuf!=NULL); // grab another buffer's worth while((!feof(f)) && ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) { // Convert bytes to unicode source = inBuf; sourceLimit = inBuf + count; do { target = uBuf; targetLimit = uBuf + uBufSize; ucnv_toUnicode(conv, &target, targetLimit, &source, sourceLimit, NULL, feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ /* is true (when no more data will come) */ &status); if(status == U_BUFFER_OVERFLOW_ERROR) { // simply ran out of space - we'll reset the target ptr the next // time through the loop. status = U_ZERO_ERROR; } else { // Check other errors here. assert(U_SUCCESS(status)); // Break out of the loop (by force) } // Process the Unicode // Todo: handle UTF-16/surrogates for(p = uBuf; p<target; p++) { if(u_isalpha(*p)) letters++; total++; } } while (source < sourceLimit); // while simply out of space } printf("%d letters out of %d total UChars.\n", letters, total); // ***************************** END SAMPLE ******************** ucnv_close(conv); printf("\n"); fclose(f); return U_ZERO_ERROR; }
jboolean fastiva_vm_Character_C$__isLetterImpl(jint codePoint) { return u_isalpha(codePoint); }
static bool IsAlpha(wxChar ch) { return u_isalpha(ch); }
// Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { unichar_str = UNICHARSET::kCustomLigatures[i][0]; break; } } // Convert the unichar to UTF32 representation std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str); // Assume that if the property is true for any character in the string, // then it holds for the whole "character". bool unichar_isalpha = false; bool unichar_islower = false; bool unichar_isupper = false; bool unichar_isdigit = false; bool unichar_ispunct = false; for (char32 u_ch : uni_vector) { if (u_isalpha(u_ch)) unichar_isalpha = true; if (u_islower(u_ch)) unichar_islower = true; if (u_isupper(u_ch)) unichar_isupper = true; if (u_isdigit(u_ch)) unichar_isdigit = true; if (u_ispunct(u_ch)) unichar_ispunct = true; } unicharset->set_isalpha(unichar_id, unichar_isalpha); unicharset->set_islower(unichar_id, unichar_islower); unicharset->set_isupper(unichar_id, unichar_isupper); unicharset->set_isdigit(unichar_id, unichar_isdigit); unicharset->set_ispunctuation(unichar_id, unichar_ispunct); tesseract::IcuErrorCode err; unicharset->set_script(unichar_id, uscript_getName( uscript_getScript(uni_vector[0], err))); const int num_code_points = uni_vector.size(); // Obtain the lower/upper case if needed and record it in the properties. unicharset->set_other_case(unichar_id, unichar_id); if (unichar_islower || unichar_isupper) { std::vector<char32> other_case(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. // However since they deal with UChars (so need a conversion function // from char32 or UTF8string) and require a meaningful locale string, // for now u_tolower()/u_toupper() are used. other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]); } std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case); UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str()); if (other_case_id != INVALID_UNICHAR_ID) { unicharset->set_other_case(unichar_id, other_case_id); } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str); } } // Set RTL property and obtain mirror unichar ID from ICU. std::vector<char32> mirrors(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { mirrors[i] = u_charMirror(uni_vector[i]); if (i == 0) { // set directionality to that of the 1st code point unicharset->set_direction(unichar_id, static_cast<UNICHARSET::Direction>( u_charDirection(uni_vector[i]))); } } std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors); UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); if (mirror_uch_id != INVALID_UNICHAR_ID) { unicharset->set_mirror(unichar_id, mirror_uch_id); } else if (report_errors) { tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str); } // Record normalized version of this unichar. std::string normed_str; if (unichar_id != 0 && tesseract::NormalizeUTF8String( decompose ? tesseract::UnicodeNormMode::kNFKD : tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str, &normed_str) && !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); } ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); } unicharset->post_load_setup(); }
// Determines whether the specified code point is a letter character. // True for general categories "L" (letters). bool BUnicodeChar::IsAlpha(uint32 c) { BUnicodeChar(); return u_isalpha(c); }
int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size, UBool * isError) { if (size < MIN_OUTPUT_SIZE) { *isError = TRUE; return 0; } *isError = FALSE; // Normalize the first character to remove accents using the NFD normalization UErrorCode errorCode = U_ZERO_ERROR; int32_t len = unorm_next(iter, out, size, UNORM_NFD, 0 /* options */, TRUE /* normalize */, NULL, &errorCode); if (U_FAILURE(errorCode)) { *isError = TRUE; return 0; } if (len == 0) { // Empty input string return 0; } UChar c = out[0]; // We are only interested in letters if (!u_isalpha(c)) { return 0; } c = u_toupper(c); // Check for explicitly mapped characters UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar)); if (c_mapped != 0) { out[0] = c_mapped; return 1; } // Convert Kanas to Hiragana UChar next = len > 2 ? out[1] : 0; c = android::GetNormalizedCodePoint(c, next, NULL); // Traditional grouping of Hiragana characters if (0x3042 <= c && c <= 0x309F) { if (c < 0x304B) c = 0x3042; // a else if (c < 0x3055) c = 0x304B; // ka else if (c < 0x305F) c = 0x3055; // sa else if (c < 0x306A) c = 0x305F; // ta else if (c < 0x306F) c = 0x306A; // na else if (c < 0x307E) c = 0x306F; // ha else if (c < 0x3084) c = 0x307E; // ma else if (c < 0x3089) c = 0x3084; // ya else if (c < 0x308F) c = 0x3089; // ra else c = 0x308F; // wa out[0] = c; return 1; } if (is_CJK(c)) { if (strncmp(locale, "ja", 2) == 0) { // Japanese word meaning "misc" or "other" out[0] = 0x4ED6; return 1; } else { return 0; } } out[0] = c; return 1; }
bool isalpha(uint32_t codepoint) { return u_isalpha(static_cast<UChar32>(codepoint)); }