bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { bool all_one_case = true; bool capitalized; bool prev_upper; bool prev_lower; bool first_upper; bool first_lower; bool cur_upper; bool cur_lower; string str8; if (!char_set) { // If cube char_set is missing, use C-locale-dependent functions // on UTF8 characters to determine case properties. first_upper = isupper(str32[0]); first_lower = islower(str32[0]); if (first_upper) capitalized = true; prev_upper = first_upper; prev_lower = islower(str32[0]); for (int c = 1; str32[c] != 0; ++c) { cur_upper = isupper(str32[c]); cur_lower = islower(str32[c]); if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) all_one_case = false; if (cur_upper) capitalized = false; prev_upper = cur_upper; prev_lower = cur_lower; } } else { UNICHARSET *unicharset = char_set->InternalUnicharset(); // Use UNICHARSET functions to determine case properties first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); if (first_upper) capitalized = true; prev_upper = first_upper; prev_lower = unicharset->get_islower(char_set->ClassID(str32[0])); for (int c = 1; c < StrLen(str32); ++c) { cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) all_one_case = false; if (cur_upper) capitalized = false; prev_upper = cur_upper; prev_lower = cur_lower; } } return all_one_case || capitalized; }
// Builds a lower-cased copy of a char_32 string.
//
// str32:    string to convert.
// char_set: cube character set supplying the case information.
//
// Returns a 0-terminated array allocated with new[] (release it with
// delete[]), or NULL when
//   * char_set is NULL,
//   * a character of the input equals INVALID_UNICHAR_ID, or
//   * the other-case form of an upper-case character is missing or is not
//     exactly one character long.
char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
  if (char_set == NULL)
    return NULL;

  UNICHARSET *unicharset = char_set->InternalUnicharset();
  const int length = StrLen(str32);
  char_32 *result = new char_32[length + 1];
  if (result == NULL)
    return NULL;

  for (int pos = 0; pos < length; ++pos) {
    const char_32 current = str32[pos];
    if (current == INVALID_UNICHAR_ID) {
      delete[] result;
      return NULL;
    }

    // Anything that is not upper-case is copied through untouched.
    if (!unicharset->get_isupper(char_set->ClassID(current))) {
      result[pos] = current;
      continue;
    }

    // Upper-case: look up the string of the other-case class.
    UNICHAR_ID other_case_id =
        unicharset->get_other_case(char_set->ClassID(current));
    const char_32 *other_case_str = char_set->ClassString(other_case_id);
    // The lower-case form is expected to be a single character.
    if (other_case_str == NULL || StrLen(other_case_str) != 1) {
      delete[] result;
      return NULL;
    }
    result[pos] = other_case_str[0];
  }

  result[length] = 0;
  return result;
}
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { int state = 0; int x; for (x = 0; x < word.length(); ++x) { UNICHAR_ID ch_id = word.unichar_id(x); if (unicharset.get_isupper(ch_id)) state = case_state_table[state][1]; else if (unicharset.get_islower(ch_id)) state = case_state_table[state][2]; else if (unicharset.get_isdigit(ch_id)) state = case_state_table[state][3]; else state = case_state_table[state][0]; if (state == -1) return false; } return state != 5; // single lower is bad }
// Returns true when |unichar_id| is an upper-case character in |ch_set|
// other than "O".
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  // Not upper-case at all: no need to compare against "O".
  if (!ch_set.get_isupper(unichar_id))
    return false;
  return !ch_set.eq(unichar_id, "O");
}